diff --git "a/checkpoint-25354/trainer_state.json" "b/checkpoint-25354/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-25354/trainer_state.json" @@ -0,0 +1,188718 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.089327560869047, + "global_step": 25354, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-07, + "loss": 10.8508, + "theoretical_loss": 20.81281780154715, + "tokens_seen": 65536 + }, + { + "epoch": 0.0, + "learning_rate": 1.984126984126984e-06, + "loss": 10.9289, + "theoretical_loss": 17.566201104328645, + "tokens_seen": 131072 + }, + { + "epoch": 0.0, + "learning_rate": 2.9761904761904763e-06, + "loss": 10.8587, + "theoretical_loss": 15.939477092836569, + "tokens_seen": 196608 + }, + { + "epoch": 0.0, + "learning_rate": 3.968253968253968e-06, + "loss": 10.7215, + "theoretical_loss": 14.89231675598857, + "tokens_seen": 262144 + }, + { + "epoch": 0.0, + "learning_rate": 4.96031746031746e-06, + "loss": 10.5418, + "theoretical_loss": 14.136216937762974, + "tokens_seen": 327680 + }, + { + "epoch": 0.0, + "learning_rate": 5.9523809523809525e-06, + "loss": 10.4507, + "theoretical_loss": 13.552561472550224, + "tokens_seen": 393216 + }, + { + "epoch": 0.0, + "learning_rate": 6.944444444444444e-06, + "loss": 10.2227, + "theoretical_loss": 13.08180900140119, + "tokens_seen": 458752 + }, + { + "epoch": 0.0, + "learning_rate": 7.936507936507936e-06, + "loss": 10.0005, + "theoretical_loss": 12.690129625483323, + "tokens_seen": 524288 + }, + { + "epoch": 0.0, + "learning_rate": 8.928571428571428e-06, + "loss": 9.9334, + "theoretical_loss": 12.356592463873625, + "tokens_seen": 589824 + }, + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-06, + "loss": 9.8476, + "theoretical_loss": 12.067412607035077, + "tokens_seen": 655360 + }, + { + "epoch": 0.0, + "learning_rate": 1.0912698412698412e-05, + "loss": 9.9366, + "theoretical_loss": 11.813066231101676, + "tokens_seen": 720896 + }, + { + "epoch": 0.0, + "learning_rate": 1.1904761904761905e-05, + "loss": 9.7097, + "theoretical_loss": 11.586719208706729, + "tokens_seen": 786432 + }, + { + "epoch": 0.0, + "learning_rate": 1.2896825396825396e-05, + "loss": 9.3611, + "theoretical_loss": 11.383314140186787, + "tokens_seen": 851968 + }, + { + "epoch": 0.0, + "learning_rate": 1.3888888888888888e-05, + "loss": 9.3677, + "theoretical_loss": 11.199011702111871, + "tokens_seen": 917504 + }, + { + "epoch": 0.0, + "learning_rate": 1.4880952380952381e-05, + "loss": 9.5872, + "theoretical_loss": 11.030833917977912, + "tokens_seen": 983040 + }, + { + "epoch": 0.0, + "learning_rate": 1.5873015873015872e-05, + "loss": 9.4471, + "theoretical_loss": 10.87642808645695, + "tokens_seen": 1048576 + }, + { + "epoch": 0.0, + "learning_rate": 1.6865079365079364e-05, + "loss": 9.4183, + "theoretical_loss": 10.733905740062724, + "tokens_seen": 1114112 + }, + { + "epoch": 0.0, + "learning_rate": 1.7857142857142855e-05, + "loss": 9.5061, + "theoretical_loss": 10.60172987623028, + "tokens_seen": 1179648 + }, + { + "epoch": 0.0, + "learning_rate": 1.884920634920635e-05, + "loss": 9.0755, + "theoretical_loss": 10.478634172356642, + "tokens_seen": 1245184 + }, + { + "epoch": 0.0, + "learning_rate": 1.984126984126984e-05, + "loss": 9.1282, + "theoretical_loss": 10.36356394376333, + "tokens_seen": 1310720 + }, + { + "epoch": 0.0, + "learning_rate": 2.0833333333333333e-05, + "loss": 9.1593, + "theoretical_loss": 10.255632220896747, + "tokens_seen": 1376256 + }, + { + "epoch": 0.0, + "learning_rate": 2.1825396825396824e-05, + "loss": 9.05, + "theoretical_loss": 10.15408655327002, + "tokens_seen": 1441792 + }, + { + "epoch": 0.0, + "learning_rate": 2.2817460317460315e-05, + "loss": 9.1536, + "theoretical_loss": 10.058283561732598, + "tokens_seen": 1507328 + }, + { + "epoch": 0.0, + "learning_rate": 2.380952380952381e-05, + "loss": 9.088, + "theoretical_loss": 9.967669178840278, + "tokens_seen": 1572864 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 36240, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 9.318564414978027, + "objective/train/theoretical_loss": 9.881763126393109, + "objective/train/tokens_used": 22098400, + "theoretical_loss": 9.881763126393109, + "tokens_seen": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 2.48015873015873e-05, + "loss": 9.2884, + "theoretical_loss": 9.881763126393109, + "tokens_seen": 1638400 + }, + { + "epoch": 0.0, + "learning_rate": 2.5793650793650793e-05, + "loss": 9.2085, + "theoretical_loss": 9.80014659154056, + "tokens_seen": 1703936 + }, + { + "epoch": 0.0, + "learning_rate": 2.6785714285714284e-05, + "loss": 8.8681, + "theoretical_loss": 9.722452346907446, + "tokens_seen": 1769472 + }, + { + "epoch": 0.0, + "learning_rate": 2.7777777777777776e-05, + "loss": 8.6996, + "theoretical_loss": 9.648356759081546, + "tokens_seen": 1835008 + }, + { + "epoch": 0.0, + "learning_rate": 2.876984126984127e-05, + "loss": 9.0688, + "theoretical_loss": 9.577573271145639, + "tokens_seen": 1900544 + }, + { + "epoch": 0.0, + "learning_rate": 2.9761904761904762e-05, + "loss": 8.7821, + "theoretical_loss": 9.509847046764852, + "tokens_seen": 1966080 + }, + { + "epoch": 0.0, + "learning_rate": 3.075396825396825e-05, + "loss": 8.9255, + "theoretical_loss": 9.444950537631936, + "tokens_seen": 2031616 + }, + { + "epoch": 0.0, + "learning_rate": 3.1746031746031745e-05, + "loss": 8.8961, + "theoretical_loss": 9.382679790910457, + "tokens_seen": 2097152 + }, + { + "epoch": 0.0, + "learning_rate": 3.273809523809524e-05, + "loss": 8.9698, + "theoretical_loss": 9.32285135423398, + "tokens_seen": 2162688 + }, + { + "epoch": 0.0, + "learning_rate": 3.373015873015873e-05, + "loss": 8.3261, + "theoretical_loss": 9.265299666660276, + "tokens_seen": 2228224 + }, + { + "epoch": 0.0, + "learning_rate": 3.472222222222222e-05, + "loss": 8.8584, + "theoretical_loss": 9.209874847444755, + "tokens_seen": 2293760 + }, + { + "epoch": 0.0, + "learning_rate": 3.571428571428571e-05, + "loss": 8.443, + "theoretical_loss": 9.156440812508292, + "tokens_seen": 2359296 + }, + { + "epoch": 0.0, + "learning_rate": 3.670634920634921e-05, + "loss": 8.627, + "theoretical_loss": 9.10487366241335, + "tokens_seen": 2424832 + }, + { + "epoch": 0.0, + "learning_rate": 3.76984126984127e-05, + "loss": 8.4334, + "theoretical_loss": 9.055060296533734, + "tokens_seen": 2490368 + }, + { + "epoch": 0.0, + "learning_rate": 3.8690476190476195e-05, + "loss": 8.673, + "theoretical_loss": 9.006897216643829, + "tokens_seen": 2555904 + }, + { + "epoch": 0.0, + "learning_rate": 3.968253968253968e-05, + "loss": 8.4895, + "theoretical_loss": 8.960289489909357, + "tokens_seen": 2621440 + }, + { + "epoch": 0.0, + "learning_rate": 4.067460317460318e-05, + "loss": 8.7543, + "theoretical_loss": 8.915149846640611, + "tokens_seen": 2686976 + }, + { + "epoch": 0.0, + "learning_rate": 4.1666666666666665e-05, + "loss": 8.4625, + "theoretical_loss": 8.871397892478225, + "tokens_seen": 2752512 + }, + { + "epoch": 0.0, + "learning_rate": 4.265873015873016e-05, + "loss": 8.4903, + "theoretical_loss": 8.828959418153499, + "tokens_seen": 2818048 + }, + { + "epoch": 0.0, + "learning_rate": 4.365079365079365e-05, + "loss": 8.2531, + "theoretical_loss": 8.787765792778412, + "tokens_seen": 2883584 + }, + { + "epoch": 0.0, + "learning_rate": 4.464285714285714e-05, + "loss": 8.4917, + "theoretical_loss": 8.747753428911455, + "tokens_seen": 2949120 + }, + { + "epoch": 0.0, + "learning_rate": 4.563492063492063e-05, + "loss": 8.3923, + "theoretical_loss": 8.708863309520833, + "tokens_seen": 3014656 + }, + { + "epoch": 0.0, + "learning_rate": 4.6626984126984126e-05, + "loss": 8.2601, + "theoretical_loss": 8.671040568508847, + "tokens_seen": 3080192 + }, + { + "epoch": 0.0, + "learning_rate": 4.761904761904762e-05, + "loss": 8.2087, + "theoretical_loss": 8.634234117735474, + "tokens_seen": 3145728 + }, + { + "epoch": 0.0, + "learning_rate": 4.8611111111111115e-05, + "loss": 8.3675, + "theoretical_loss": 8.598396314536323, + "tokens_seen": 3211264 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 39163, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 8.37192440032959, + "objective/train/theoretical_loss": 8.563482664611069, + "objective/train/tokens_used": 23736800, + "theoretical_loss": 8.563482664611069, + "tokens_seen": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 4.96031746031746e-05, + "loss": 8.1175, + "theoretical_loss": 8.563482664611069, + "tokens_seen": 3276800 + }, + { + "epoch": 0.0, + "learning_rate": 5.05952380952381e-05, + "loss": 8.2132, + "theoretical_loss": 8.529451555895115, + "tokens_seen": 3342336 + }, + { + "epoch": 0.0, + "learning_rate": 5.1587301587301586e-05, + "loss": 8.1212, + "theoretical_loss": 8.496264019646002, + "tokens_seen": 3407872 + }, + { + "epoch": 0.0, + "learning_rate": 5.257936507936508e-05, + "loss": 8.1695, + "theoretical_loss": 8.463883515497187, + "tokens_seen": 3473408 + }, + { + "epoch": 0.0, + "learning_rate": 5.357142857142857e-05, + "loss": 7.7933, + "theoretical_loss": 8.432275737672779, + "tokens_seen": 3538944 + }, + { + "epoch": 0.0, + "learning_rate": 5.4563492063492063e-05, + "loss": 8.009, + "theoretical_loss": 8.401408439930716, + "tokens_seen": 3604480 + }, + { + "epoch": 0.0, + "learning_rate": 5.555555555555555e-05, + "loss": 8.0148, + "theoretical_loss": 8.371251277120209, + "tokens_seen": 3670016 + }, + { + "epoch": 0.0, + "learning_rate": 5.6547619047619046e-05, + "loss": 7.856, + "theoretical_loss": 8.341775661511075, + "tokens_seen": 3735552 + }, + { + "epoch": 0.0, + "learning_rate": 5.753968253968254e-05, + "loss": 7.9824, + "theoretical_loss": 8.31295463228533, + "tokens_seen": 3801088 + }, + { + "epoch": 0.0, + "learning_rate": 5.8531746031746036e-05, + "loss": 7.8076, + "theoretical_loss": 8.284762736781182, + "tokens_seen": 3866624 + }, + { + "epoch": 0.0, + "learning_rate": 5.9523809523809524e-05, + "loss": 7.796, + "theoretical_loss": 8.257175922251864, + "tokens_seen": 3932160 + }, + { + "epoch": 0.0, + "learning_rate": 6.051587301587302e-05, + "loss": 7.7324, + "theoretical_loss": 8.230171437050114, + "tokens_seen": 3997696 + }, + { + "epoch": 0.0, + "learning_rate": 6.15079365079365e-05, + "loss": 7.7033, + "theoretical_loss": 8.20372774027797, + "tokens_seen": 4063232 + }, + { + "epoch": 0.0, + "learning_rate": 6.25e-05, + "loss": 7.7943, + "theoretical_loss": 8.177824419053046, + "tokens_seen": 4128768 + }, + { + "epoch": 0.0, + "learning_rate": 6.349206349206349e-05, + "loss": 7.7362, + "theoretical_loss": 8.152442112639616, + "tokens_seen": 4194304 + }, + { + "epoch": 0.0, + "learning_rate": 6.448412698412699e-05, + "loss": 7.6756, + "theoretical_loss": 8.1275624427775, + "tokens_seen": 4259840 + }, + { + "epoch": 0.0, + "learning_rate": 6.547619047619048e-05, + "loss": 7.4415, + "theoretical_loss": 8.10316794961571, + "tokens_seen": 4325376 + }, + { + "epoch": 0.0, + "learning_rate": 6.646825396825397e-05, + "loss": 7.6631, + "theoretical_loss": 8.07924203272264, + "tokens_seen": 4390912 + }, + { + "epoch": 0.0, + "learning_rate": 6.746031746031745e-05, + "loss": 7.4983, + "theoretical_loss": 8.055768896701416, + "tokens_seen": 4456448 + }, + { + "epoch": 0.0, + "learning_rate": 6.845238095238096e-05, + "loss": 7.4185, + "theoretical_loss": 8.032733500989007, + "tokens_seen": 4521984 + }, + { + "epoch": 0.0, + "learning_rate": 6.944444444444444e-05, + "loss": 7.492, + "theoretical_loss": 8.010121513461836, + "tokens_seen": 4587520 + }, + { + "epoch": 0.0, + "learning_rate": 7.043650793650793e-05, + "loss": 7.3747, + "theoretical_loss": 7.987919267509379, + "tokens_seen": 4653056 + }, + { + "epoch": 0.0, + "learning_rate": 7.142857142857142e-05, + "loss": 7.1305, + "theoretical_loss": 7.966113722271801, + "tokens_seen": 4718592 + }, + { + "epoch": 0.0, + "learning_rate": 7.242063492063492e-05, + "loss": 7.408, + "theoretical_loss": 7.944692425767988, + "tokens_seen": 4784128 + }, + { + "epoch": 0.0, + "learning_rate": 7.341269841269842e-05, + "loss": 7.1773, + "theoretical_loss": 7.9236434806675184, + "tokens_seen": 4849664 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 40621, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 7.692591667175293, + "objective/train/theoretical_loss": 7.902955512484067, + "objective/train/tokens_used": 25375200, + "theoretical_loss": 7.902955512484067, + "tokens_seen": 4915200 + }, + { + "epoch": 0.0, + "learning_rate": 7.440476190476191e-05, + "loss": 7.3086, + "theoretical_loss": 7.902955512484067, + "tokens_seen": 4915200 + }, + { + "epoch": 0.0, + "learning_rate": 7.53968253968254e-05, + "loss": 7.4029, + "theoretical_loss": 7.882617639989203, + "tokens_seen": 4980736 + }, + { + "epoch": 0.0, + "learning_rate": 7.63888888888889e-05, + "loss": 7.3189, + "theoretical_loss": 7.862619447664628, + "tokens_seen": 5046272 + }, + { + "epoch": 0.0, + "learning_rate": 7.738095238095239e-05, + "loss": 7.3031, + "theoretical_loss": 7.842950960027937, + "tokens_seen": 5111808 + }, + { + "epoch": 0.0, + "learning_rate": 7.837301587301588e-05, + "loss": 7.3367, + "theoretical_loss": 7.823602617682313, + "tokens_seen": 5177344 + }, + { + "epoch": 0.0, + "learning_rate": 7.936507936507937e-05, + "loss": 7.1338, + "theoretical_loss": 7.804565254954165, + "tokens_seen": 5242880 + }, + { + "epoch": 0.0, + "learning_rate": 8.035714285714287e-05, + "loss": 7.1897, + "theoretical_loss": 7.7858300789950725, + "tokens_seen": 5308416 + }, + { + "epoch": 0.0, + "learning_rate": 8.134920634920635e-05, + "loss": 7.076, + "theoretical_loss": 7.767388650235364, + "tokens_seen": 5373952 + }, + { + "epoch": 0.0, + "learning_rate": 8.234126984126984e-05, + "loss": 7.0319, + "theoretical_loss": 7.749232864086619, + "tokens_seen": 5439488 + }, + { + "epoch": 0.0, + "learning_rate": 8.333333333333333e-05, + "loss": 6.8697, + "theoretical_loss": 7.731354933799318, + "tokens_seen": 5505024 + }, + { + "epoch": 0.0, + "learning_rate": 8.432539682539683e-05, + "loss": 7.1683, + "theoretical_loss": 7.71374737438992, + "tokens_seen": 5570560 + }, + { + "epoch": 0.0, + "learning_rate": 8.531746031746032e-05, + "loss": 7.1308, + "theoretical_loss": 7.696402987558934, + "tokens_seen": 5636096 + }, + { + "epoch": 0.0, + "learning_rate": 8.630952380952381e-05, + "loss": 7.1599, + "theoretical_loss": 7.679314847528181, + "tokens_seen": 5701632 + }, + { + "epoch": 0.0, + "learning_rate": 8.73015873015873e-05, + "loss": 7.0402, + "theoretical_loss": 7.662476287731328, + "tokens_seen": 5767168 + }, + { + "epoch": 0.0, + "learning_rate": 8.82936507936508e-05, + "loss": 6.8803, + "theoretical_loss": 7.645880888297279, + "tokens_seen": 5832704 + }, + { + "epoch": 0.0, + "learning_rate": 8.928571428571429e-05, + "loss": 7.0719, + "theoretical_loss": 7.629522464270861, + "tokens_seen": 5898240 + }, + { + "epoch": 0.0, + "learning_rate": 9.027777777777777e-05, + "loss": 6.941, + "theoretical_loss": 7.613395054519696, + "tokens_seen": 5963776 + }, + { + "epoch": 0.0, + "learning_rate": 9.126984126984126e-05, + "loss": 6.8738, + "theoretical_loss": 7.59749291128028, + "tokens_seen": 6029312 + }, + { + "epoch": 0.0, + "learning_rate": 9.226190476190476e-05, + "loss": 7.0739, + "theoretical_loss": 7.581810490299888, + "tokens_seen": 6094848 + }, + { + "epoch": 0.0, + "learning_rate": 9.325396825396825e-05, + "loss": 6.8629, + "theoretical_loss": 7.5663424415343705, + "tokens_seen": 6160384 + }, + { + "epoch": 0.0, + "learning_rate": 9.424603174603175e-05, + "loss": 7.1214, + "theoretical_loss": 7.551083600364949, + "tokens_seen": 6225920 + }, + { + "epoch": 0.0, + "learning_rate": 9.523809523809524e-05, + "loss": 6.9486, + "theoretical_loss": 7.536028979299919, + "tokens_seen": 6291456 + }, + { + "epoch": 0.0, + "learning_rate": 9.623015873015874e-05, + "loss": 6.7577, + "theoretical_loss": 7.521173760129762, + "tokens_seen": 6356992 + }, + { + "epoch": 0.0, + "learning_rate": 9.722222222222223e-05, + "loss": 6.837, + "theoretical_loss": 7.506513286506497, + "tokens_seen": 6422528 + }, + { + "epoch": 0.0, + "learning_rate": 9.821428571428572e-05, + "loss": 6.9609, + "theoretical_loss": 7.492043056920249, + "tokens_seen": 6488064 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 44240, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 7.047701835632324, + "objective/train/theoretical_loss": 7.4777587180480305, + "objective/train/tokens_used": 27013600, + "theoretical_loss": 7.4777587180480305, + "tokens_seen": 6553600 + }, + { + "epoch": 0.0, + "learning_rate": 9.92063492063492e-05, + "loss": 6.7018, + "theoretical_loss": 7.4777587180480305, + "tokens_seen": 6553600 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010019841269841271, + "loss": 6.9157, + "theoretical_loss": 7.463656058451462, + "tokens_seen": 6619136 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001011904761904762, + "loss": 6.8612, + "theoretical_loss": 7.449731002601916, + "tokens_seen": 6684672 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010218253968253968, + "loss": 6.6754, + "theoretical_loss": 7.435979605213019, + "tokens_seen": 6750208 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010317460317460317, + "loss": 6.8918, + "theoretical_loss": 7.422398045861905, + "tokens_seen": 6815744 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010416666666666667, + "loss": 6.7951, + "theoretical_loss": 7.408982623881875, + "tokens_seen": 6881280 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010515873015873016, + "loss": 6.5998, + "theoretical_loss": 7.395729753510345, + "tokens_seen": 6946816 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010615079365079365, + "loss": 6.7879, + "theoretical_loss": 7.3826359592770325, + "tokens_seen": 7012352 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010714285714285714, + "loss": 6.6789, + "theoretical_loss": 7.369697871618373, + "tokens_seen": 7077888 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010813492063492064, + "loss": 6.808, + "theoretical_loss": 7.3569122227050885, + "tokens_seen": 7143424 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010912698412698413, + "loss": 6.6585, + "theoretical_loss": 7.3442758424706875, + "tokens_seen": 7208960 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011011904761904761, + "loss": 6.8593, + "theoretical_loss": 7.331785654829519, + "tokens_seen": 7274496 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001111111111111111, + "loss": 6.7283, + "theoretical_loss": 7.319438674073677, + "tokens_seen": 7340032 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001121031746031746, + "loss": 6.8853, + "theoretical_loss": 7.307232001438824, + "tokens_seen": 7405568 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011309523809523809, + "loss": 6.7527, + "theoretical_loss": 7.295162821829564, + "tokens_seen": 7471104 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011408730158730158, + "loss": 6.6535, + "theoretical_loss": 7.283228400695652, + "tokens_seen": 7536640 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011507936507936508, + "loss": 6.893, + "theoretical_loss": 7.271426081050832, + "tokens_seen": 7602176 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011607142857142858, + "loss": 6.7886, + "theoretical_loss": 7.259753280626623, + "tokens_seen": 7667712 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011706349206349207, + "loss": 6.7507, + "theoretical_loss": 7.24820748915387, + "tokens_seen": 7733248 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011805555555555556, + "loss": 6.4954, + "theoretical_loss": 7.236786265765262, + "tokens_seen": 7798784 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011904761904761905, + "loss": 6.7152, + "theoretical_loss": 7.225487236512497, + "tokens_seen": 7864320 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012003968253968255, + "loss": 6.3701, + "theoretical_loss": 7.21430809199212, + "tokens_seen": 7929856 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012103174603174604, + "loss": 6.5083, + "theoretical_loss": 7.2032465850744005, + "tokens_seen": 7995392 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012202380952380953, + "loss": 6.5861, + "theoretical_loss": 7.192300528730015, + "tokens_seen": 8060928 + }, + { + "epoch": 0.0, + "learning_rate": 0.000123015873015873, + "loss": 6.8303, + "theoretical_loss": 7.1814677939495155, + "tokens_seen": 8126464 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 47165, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.666480541229248, + "objective/train/theoretical_loss": 7.1707463077509646, + "objective/train/tokens_used": 28652000, + "theoretical_loss": 7.1707463077509646, + "tokens_seen": 8192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001240079365079365, + "loss": 6.7611, + "theoretical_loss": 7.1707463077509646, + "tokens_seen": 8192000 + }, + { + "epoch": 0.0, + "learning_rate": 0.000125, + "loss": 6.672, + "theoretical_loss": 7.160134051271272, + "tokens_seen": 8257536 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001259920634920635, + "loss": 6.637, + "theoretical_loss": 7.149629057937138, + "tokens_seen": 8323072 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012698412698412698, + "loss": 6.4638, + "theoretical_loss": 7.139229411711638, + "tokens_seen": 8388608 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012797619047619048, + "loss": 6.7377, + "theoretical_loss": 7.128933245412794, + "tokens_seen": 8454144 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012896825396825398, + "loss": 6.4198, + "theoretical_loss": 7.118738739100616, + "tokens_seen": 8519680 + }, + { + "epoch": 0.0, + "learning_rate": 0.00012996031746031748, + "loss": 6.861, + "theoretical_loss": 7.1086441185293445, + "tokens_seen": 8585216 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013095238095238096, + "loss": 6.2101, + "theoretical_loss": 7.09864765366177, + "tokens_seen": 8650752 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013194444444444446, + "loss": 6.3028, + "theoretical_loss": 7.088747657242693, + "tokens_seen": 8716288 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013293650793650793, + "loss": 6.6719, + "theoretical_loss": 7.078942483428749, + "tokens_seen": 8781824 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013392857142857144, + "loss": 6.5809, + "theoretical_loss": 7.069230526471966, + "tokens_seen": 8847360 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001349206349206349, + "loss": 6.5982, + "theoretical_loss": 7.059610219454568, + "tokens_seen": 8912896 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001359126984126984, + "loss": 6.5841, + "theoretical_loss": 7.0500800330726685, + "tokens_seen": 8978432 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001369047619047619, + "loss": 6.4125, + "theoretical_loss": 7.040638474466625, + "tokens_seen": 9043968 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013789682539682541, + "loss": 6.4054, + "theoretical_loss": 7.031284086095933, + "tokens_seen": 9109504 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001388888888888889, + "loss": 6.5818, + "theoretical_loss": 7.022015444656678, + "tokens_seen": 9175040 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001398809523809524, + "loss": 6.6639, + "theoretical_loss": 7.012831160039609, + "tokens_seen": 9240576 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014087301587301586, + "loss": 6.5274, + "theoretical_loss": 7.003729874327071, + "tokens_seen": 9306112 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014186507936507937, + "loss": 6.6328, + "theoretical_loss": 6.994710260827057, + "tokens_seen": 9371648 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014285714285714284, + "loss": 6.5911, + "theoretical_loss": 6.98577102314278, + "tokens_seen": 9437184 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014384920634920634, + "loss": 6.5018, + "theoretical_loss": 6.976910894276189, + "tokens_seen": 9502720 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014484126984126984, + "loss": 6.6038, + "theoretical_loss": 6.968128635764015, + "tokens_seen": 9568256 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014583333333333335, + "loss": 6.3549, + "theoretical_loss": 6.959423036844894, + "tokens_seen": 9633792 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014682539682539685, + "loss": 6.5007, + "theoretical_loss": 6.950792913656309, + "tokens_seen": 9699328 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014781746031746032, + "loss": 6.3726, + "theoretical_loss": 6.942237108460029, + "tokens_seen": 9764864 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 48678, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.25482177734375, + "objective/train/theoretical_loss": 6.9337544888949, + "objective/train/tokens_used": 30290400, + "theoretical_loss": 6.9337544888949, + "tokens_seen": 9830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014880952380952382, + "loss": 6.4159, + "theoretical_loss": 6.9337544888949, + "tokens_seen": 9830400 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001498015873015873, + "loss": 6.4039, + "theoretical_loss": 6.925343947255817, + "tokens_seen": 9895936 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001507936507936508, + "loss": 6.4077, + "theoretical_loss": 6.917004399797798, + "tokens_seen": 9961472 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015178571428571427, + "loss": 6.3214, + "theoretical_loss": 6.908734786064147, + "tokens_seen": 10027008 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001527777777777778, + "loss": 6.3855, + "theoretical_loss": 6.900534068237688, + "tokens_seen": 10092544 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015376984126984128, + "loss": 6.4166, + "theoretical_loss": 6.89240123051416, + "tokens_seen": 10158080 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015476190476190478, + "loss": 6.4503, + "theoretical_loss": 6.884335278496871, + "tokens_seen": 10223616 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015575396825396825, + "loss": 6.2751, + "theoretical_loss": 6.87633523861175, + "tokens_seen": 10289152 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015674603174603175, + "loss": 6.3163, + "theoretical_loss": 6.868400157541997, + "tokens_seen": 10354688 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015773809523809523, + "loss": 6.2676, + "theoretical_loss": 6.860529101681551, + "tokens_seen": 10420224 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015873015873015873, + "loss": 6.3933, + "theoretical_loss": 6.85272115660663, + "tokens_seen": 10485760 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001597222222222222, + "loss": 6.3554, + "theoretical_loss": 6.844975426564642, + "tokens_seen": 10551296 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016071428571428573, + "loss": 6.3555, + "theoretical_loss": 6.8372910339797945, + "tokens_seen": 10616832 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001617063492063492, + "loss": 6.2346, + "theoretical_loss": 6.829667118974749, + "tokens_seen": 10682368 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001626984126984127, + "loss": 6.2845, + "theoretical_loss": 6.8221028389077185, + "tokens_seen": 10747904 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016369047619047618, + "loss": 6.1174, + "theoretical_loss": 6.814597367924395, + "tokens_seen": 10813440 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016468253968253969, + "loss": 6.4229, + "theoretical_loss": 6.807149896524181, + "tokens_seen": 10878976 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016567460317460316, + "loss": 6.4233, + "theoretical_loss": 6.799759631140145, + "tokens_seen": 10944512 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016666666666666666, + "loss": 6.1697, + "theoretical_loss": 6.7924257937322245, + "tokens_seen": 11010048 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016765873015873016, + "loss": 6.381, + "theoretical_loss": 6.785147621393148, + "tokens_seen": 11075584 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016865079365079366, + "loss": 6.2966, + "theoretical_loss": 6.777924365966638, + "tokens_seen": 11141120 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016964285714285717, + "loss": 6.4634, + "theoretical_loss": 6.770755293677423, + "tokens_seen": 11206656 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017063492063492064, + "loss": 6.4995, + "theoretical_loss": 6.763639684772625, + "tokens_seen": 11272192 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017162698412698414, + "loss": 6.4358, + "theoretical_loss": 6.756576833174123, + "tokens_seen": 11337728 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017261904761904762, + "loss": 5.979, + "theoretical_loss": 6.749566046141486, + "tokens_seen": 11403264 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 51612, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.18726921081543, + "objective/train/theoretical_loss": 6.7426066439450905, + "objective/train/tokens_used": 31928800, + "theoretical_loss": 6.7426066439450905, + "tokens_seen": 11468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017361111111111112, + "loss": 6.4005, + "theoretical_loss": 6.7426066439450905, + "tokens_seen": 11468800 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001746031746031746, + "loss": 6.3392, + "theoretical_loss": 6.735697959549075, + "tokens_seen": 11534336 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001755952380952381, + "loss": 6.0357, + "theoretical_loss": 6.728839338303761, + "tokens_seen": 11599872 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001765873015873016, + "loss": 6.3335, + "theoretical_loss": 6.722030137647226, + "tokens_seen": 11665408 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001775793650793651, + "loss": 6.2363, + "theoretical_loss": 6.715269726815689, + "tokens_seen": 11730944 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017857142857142857, + "loss": 6.335, + "theoretical_loss": 6.7085574865624125, + "tokens_seen": 11796480 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017956349206349207, + "loss": 6.3126, + "theoretical_loss": 6.701892808884824, + "tokens_seen": 11862016 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018055555555555555, + "loss": 5.9011, + "theoretical_loss": 6.695275096759559, + "tokens_seen": 11927552 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018154761904761905, + "loss": 6.2415, + "theoretical_loss": 6.68870376388518, + "tokens_seen": 11993088 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018253968253968252, + "loss": 6.1133, + "theoretical_loss": 6.682178234432274, + "tokens_seen": 12058624 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018353174603174602, + "loss": 6.2673, + "theoretical_loss": 6.675697942800715, + "tokens_seen": 12124160 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018452380952380953, + "loss": 6.1364, + "theoretical_loss": 6.669262333383815, + "tokens_seen": 12189696 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018551587301587303, + "loss": 6.3406, + "theoretical_loss": 6.662870860339158, + "tokens_seen": 12255232 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001865079365079365, + "loss": 6.2332, + "theoretical_loss": 6.656522987365879, + "tokens_seen": 12320768 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001875, + "loss": 6.2008, + "theoretical_loss": 6.6502181874881705, + "tokens_seen": 12386304 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001884920634920635, + "loss": 6.198, + "theoretical_loss": 6.643955942844831, + "tokens_seen": 12451840 + }, + { + "epoch": 0.0, + "learning_rate": 0.00018948412698412698, + "loss": 6.2908, + "theoretical_loss": 6.637735744484626, + "tokens_seen": 12517376 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019047619047619048, + "loss": 6.2906, + "theoretical_loss": 6.631557092167304, + "tokens_seen": 12582912 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019146825396825398, + "loss": 6.1216, + "theoretical_loss": 6.625419494170049, + "tokens_seen": 12648448 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019246031746031748, + "loss": 6.416, + "theoretical_loss": 6.619322467099223, + "tokens_seen": 12713984 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019345238095238096, + "loss": 6.0858, + "theoretical_loss": 6.613265535707211, + "tokens_seen": 12779520 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019444444444444446, + "loss": 6.0253, + "theoretical_loss": 6.607248232714213, + "tokens_seen": 12845056 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019543650793650793, + "loss": 6.1011, + "theoretical_loss": 6.60127009863481, + "tokens_seen": 12910592 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019642857142857144, + "loss": 6.0031, + "theoretical_loss": 6.59533068160918, + "tokens_seen": 12976128 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001974206349206349, + "loss": 6.1077, + "theoretical_loss": 6.589429537238785, + "tokens_seen": 13041664 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 54492, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.215681076049805, + "objective/train/theoretical_loss": 6.583566228426414, + "objective/train/tokens_used": 33567200, + "theoretical_loss": 6.583566228426414, + "tokens_seen": 13107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001984126984126984, + "loss": 6.3003, + "theoretical_loss": 6.583566228426414, + "tokens_seen": 13107200 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019940476190476191, + "loss": 6.2773, + "theoretical_loss": 6.5777403252204305, + "tokens_seen": 13172736 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020039682539682542, + "loss": 5.9944, + "theoretical_loss": 6.571951404663098, + "tokens_seen": 13238272 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002013888888888889, + "loss": 6.1635, + "theoretical_loss": 6.566199050642863, + "tokens_seen": 13303808 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002023809523809524, + "loss": 5.9796, + "theoretical_loss": 6.560482853750463, + "tokens_seen": 13369344 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020337301587301587, + "loss": 5.9366, + "theoretical_loss": 6.554802411138745, + "tokens_seen": 13434880 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020436507936507937, + "loss": 6.0629, + "theoretical_loss": 6.549157326386091, + "tokens_seen": 13500416 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020535714285714284, + "loss": 6.0184, + "theoretical_loss": 6.54354720936333, + "tokens_seen": 13565952 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020634920634920634, + "loss": 5.8987, + "theoretical_loss": 6.537971676104026, + "tokens_seen": 13631488 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020734126984126985, + "loss": 5.6595, + "theoretical_loss": 6.532430348678068, + "tokens_seen": 13697024 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020833333333333335, + "loss": 6.0039, + "theoretical_loss": 6.5269228550684195, + "tokens_seen": 13762560 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020932539682539685, + "loss": 6.3818, + "theoretical_loss": 6.521448829050978, + "tokens_seen": 13828096 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021031746031746032, + "loss": 5.9476, + "theoretical_loss": 6.516007910077416, + "tokens_seen": 13893632 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021130952380952382, + "loss": 6.1548, + "theoretical_loss": 6.51059974316095, + "tokens_seen": 13959168 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002123015873015873, + "loss": 6.0151, + "theoretical_loss": 6.50522397876491, + "tokens_seen": 14024704 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002132936507936508, + "loss": 5.9703, + "theoretical_loss": 6.499880272694068, + "tokens_seen": 14090240 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021428571428571427, + "loss": 6.2835, + "theoretical_loss": 6.494568285988618, + "tokens_seen": 14155776 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002152777777777778, + "loss": 6.0449, + "theoretical_loss": 6.489287684820745, + "tokens_seen": 14221312 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021626984126984128, + "loss": 5.7232, + "theoretical_loss": 6.484038140393699, + "tokens_seen": 14286848 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021726190476190478, + "loss": 5.997, + "theoretical_loss": 6.4788193288433105, + "tokens_seen": 14352384 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021825396825396825, + "loss": 6.1437, + "theoretical_loss": 6.473630931141869, + "tokens_seen": 14417920 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021924603174603176, + "loss": 6.2399, + "theoretical_loss": 6.468472633004308, + "tokens_seen": 14483456 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022023809523809523, + "loss": 5.9867, + "theoretical_loss": 6.463344124796616, + "tokens_seen": 14548992 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022123015873015873, + "loss": 6.2586, + "theoretical_loss": 6.45824510144643, + "tokens_seen": 14614528 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002222222222222222, + "loss": 6.0073, + "theoretical_loss": 6.45317526235573, + "tokens_seen": 14680064 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 56313, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.538941383361816, + "objective/train/theoretical_loss": 6.448134311315593, + "objective/train/tokens_used": 35205600, + "theoretical_loss": 6.448134311315593, + "tokens_seen": 14745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022321428571428573, + "loss": 5.8969, + "theoretical_loss": 6.448134311315593, + "tokens_seen": 14745600 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002242063492063492, + "loss": 5.9841, + "theoretical_loss": 6.443121956422939, + "tokens_seen": 14811136 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002251984126984127, + "loss": 6.0488, + "theoretical_loss": 6.438137909999214, + "tokens_seen": 14876672 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022619047619047618, + "loss": 6.0182, + "theoretical_loss": 6.433181888510964, + "tokens_seen": 14942208 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022718253968253969, + "loss": 6.0161, + "theoretical_loss": 6.428253612492239, + "tokens_seen": 15007744 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022817460317460316, + "loss": 6.0139, + "theoretical_loss": 6.4233528064687855, + "tokens_seen": 15073280 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022916666666666666, + "loss": 5.8365, + "theoretical_loss": 6.418479198883969, + "tokens_seen": 15138816 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023015873015873016, + "loss": 5.8087, + "theoretical_loss": 6.413632522026391, + "tokens_seen": 15204352 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023115079365079367, + "loss": 5.9756, + "theoretical_loss": 6.40881251195914, + "tokens_seen": 15269888 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023214285714285717, + "loss": 6.161, + "theoretical_loss": 6.404018908450656, + "tokens_seen": 15335424 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023313492063492064, + "loss": 5.9309, + "theoretical_loss": 6.399251454907132, + "tokens_seen": 15400960 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023412698412698414, + "loss": 5.7438, + "theoretical_loss": 6.394509898306452, + "tokens_seen": 15466496 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023511904761904762, + "loss": 5.9422, + "theoretical_loss": 6.389793989133574, + "tokens_seen": 15532032 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023611111111111112, + "loss": 5.8497, + "theoretical_loss": 6.385103481317387, + "tokens_seen": 15597568 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002371031746031746, + "loss": 5.7365, + "theoretical_loss": 6.380438132168923, + "tokens_seen": 15663104 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002380952380952381, + "loss": 6.0026, + "theoretical_loss": 6.375797702320966, + "tokens_seen": 15728640 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002390873015873016, + "loss": 6.032, + "theoretical_loss": 6.371181955668966, + "tokens_seen": 15794176 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002400793650793651, + "loss": 6.1233, + "theoretical_loss": 6.366590659313248, + "tokens_seen": 15859712 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024107142857142857, + "loss": 5.9843, + "theoretical_loss": 6.36202358350248, + "tokens_seen": 15925248 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024206349206349207, + "loss": 6.12, + "theoretical_loss": 6.357480501578371, + "tokens_seen": 15990784 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024305555555555555, + "loss": 5.6339, + "theoretical_loss": 6.352961189921553, + "tokens_seen": 16056320 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024404761904761905, + "loss": 5.8474, + "theoretical_loss": 6.348465427898629, + "tokens_seen": 16121856 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024503968253968255, + "loss": 6.083, + "theoretical_loss": 6.343992997810366, + "tokens_seen": 16187392 + }, + { + "epoch": 0.0, + "learning_rate": 0.000246031746031746, + "loss": 5.8857, + "theoretical_loss": 6.33954368484097, + "tokens_seen": 16252928 + }, + { + "epoch": 0.0, + "learning_rate": 0.00024702380952380955, + "loss": 5.9821, + "theoretical_loss": 6.33511727700846, + "tokens_seen": 16318464 + }, + { + "epoch": 0.0, + "objective/train/docs_used": 59194, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.582751274108887, + "objective/train/theoretical_loss": 6.330713565116083, + "objective/train/tokens_used": 36844000, + "theoretical_loss": 6.330713565116083, + "tokens_seen": 16384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.000248015873015873, + "loss": 5.7229, + "theoretical_loss": 6.330713565116083, + "tokens_seen": 16384000 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002490079365079365, + "loss": 5.7559, + "theoretical_loss": 6.326332342704751, + "tokens_seen": 16449536 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025, + "loss": 5.7506, + "theoretical_loss": 6.32197340600647, + "tokens_seen": 16515072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002509920634920635, + "loss": 5.903, + "theoretical_loss": 6.3176365538987636, + "tokens_seen": 16580608 + }, + { + "epoch": 0.01, + "learning_rate": 0.000251984126984127, + "loss": 6.0471, + "theoretical_loss": 6.313321587860021, + "tokens_seen": 16646144 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025297619047619046, + "loss": 5.9997, + "theoretical_loss": 6.309028311925785, + "tokens_seen": 16711680 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025396825396825396, + "loss": 5.9546, + "theoretical_loss": 6.304756532645939, + "tokens_seen": 16777216 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025496031746031746, + "loss": 6.0752, + "theoretical_loss": 6.300506059042775, + "tokens_seen": 16842752 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025595238095238096, + "loss": 6.0607, + "theoretical_loss": 6.296276702569918, + "tokens_seen": 16908288 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002569444444444444, + "loss": 5.8776, + "theoretical_loss": 6.292068277072099, + "tokens_seen": 16973824 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025793650793650796, + "loss": 5.7805, + "theoretical_loss": 6.28788059874573, + "tokens_seen": 17039360 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025892857142857146, + "loss": 5.8387, + "theoretical_loss": 6.283713486100297, + "tokens_seen": 17104896 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025992063492063497, + "loss": 5.9433, + "theoretical_loss": 6.279566759920507, + "tokens_seen": 17170432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002609126984126984, + "loss": 6.0323, + "theoretical_loss": 6.275440243229228, + "tokens_seen": 17235968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002619047619047619, + "loss": 5.8229, + "theoretical_loss": 6.271333761251142, + "tokens_seen": 17301504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002628968253968254, + "loss": 5.8986, + "theoretical_loss": 6.267247141377137, + "tokens_seen": 17367040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002638888888888889, + "loss": 5.653, + "theoretical_loss": 6.2631802131294085, + "tokens_seen": 17432576 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026488095238095237, + "loss": 5.8062, + "theoretical_loss": 6.259132808127246, + "tokens_seen": 17498112 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026587301587301587, + "loss": 5.6814, + "theoretical_loss": 6.255104760053497, + "tokens_seen": 17563648 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026686507936507937, + "loss": 5.8265, + "theoretical_loss": 6.251095904621689, + "tokens_seen": 17629184 + }, + { + "epoch": 0.01, + "learning_rate": 0.00026785714285714287, + "loss": 5.8761, + "theoretical_loss": 6.247106079543801, + "tokens_seen": 17694720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002688492063492063, + "loss": 5.8617, + "theoretical_loss": 6.243135124498652, + "tokens_seen": 17760256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002698412698412698, + "loss": 5.9156, + "theoretical_loss": 6.239182881100916, + "tokens_seen": 17825792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002708333333333333, + "loss": 5.7191, + "theoretical_loss": 6.235249192870732, + "tokens_seen": 17891328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002718253968253968, + "loss": 5.8078, + "theoretical_loss": 6.231333905203899, + "tokens_seen": 17956864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 62100, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 6.079071521759033, + "objective/train/theoretical_loss": 6.227436865342643, + "objective/train/tokens_used": 38482400, + "theoretical_loss": 6.227436865342643, + "tokens_seen": 18022400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002728174603174603, + "loss": 5.6676, + "theoretical_loss": 6.227436865342643, + "tokens_seen": 18022400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002738095238095238, + "loss": 5.6418, + "theoretical_loss": 6.223557922346955, + "tokens_seen": 18087936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002748015873015873, + "loss": 5.9717, + "theoretical_loss": 6.219696927066456, + "tokens_seen": 18153472 + }, + { + "epoch": 0.01, + "learning_rate": 0.00027579365079365083, + "loss": 5.8694, + "theoretical_loss": 6.215853732112821, + "tokens_seen": 18219008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00027678571428571433, + "loss": 5.8526, + "theoretical_loss": 6.212028191832702, + "tokens_seen": 18284544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002777777777777778, + "loss": 5.6756, + "theoretical_loss": 6.208220162281178, + "tokens_seen": 18350080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002787698412698413, + "loss": 5.7205, + "theoretical_loss": 6.204429501195701, + "tokens_seen": 18415616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002797619047619048, + "loss": 5.8596, + "theoretical_loss": 6.20065606797053, + "tokens_seen": 18481152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002807539682539683, + "loss": 5.8051, + "theoretical_loss": 6.19689972363164, + "tokens_seen": 18546688 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028174603174603173, + "loss": 5.9718, + "theoretical_loss": 6.1931603308120975, + "tokens_seen": 18612224 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028273809523809523, + "loss": 5.745, + "theoretical_loss": 6.189437753727901, + "tokens_seen": 18677760 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028373015873015873, + "loss": 5.9058, + "theoretical_loss": 6.185731858154261, + "tokens_seen": 18743296 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028472222222222223, + "loss": 5.7632, + "theoretical_loss": 6.182042511402313, + "tokens_seen": 18808832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002857142857142857, + "loss": 5.3538, + "theoretical_loss": 6.17836958229627, + "tokens_seen": 18874368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002867063492063492, + "loss": 5.7002, + "theoretical_loss": 6.1747129411509825, + "tokens_seen": 18939904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002876984126984127, + "loss": 5.6034, + "theoretical_loss": 6.171072459749913, + "tokens_seen": 19005440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002886904761904762, + "loss": 5.8292, + "theoretical_loss": 6.1674480113235095, + "tokens_seen": 19070976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002896825396825397, + "loss": 5.6209, + "theoretical_loss": 6.163839470527964, + "tokens_seen": 19136512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002906746031746032, + "loss": 5.6608, + "theoretical_loss": 6.160246713424372, + "tokens_seen": 19202048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002916666666666667, + "loss": 5.7117, + "theoretical_loss": 6.156669617458243, + "tokens_seen": 19267584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002926587301587302, + "loss": 5.9299, + "theoretical_loss": 6.153108061439397, + "tokens_seen": 19333120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002936507936507937, + "loss": 5.7272, + "theoretical_loss": 6.149561925522211, + "tokens_seen": 19398656 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029464285714285714, + "loss": 5.8686, + "theoretical_loss": 6.146031091186222, + "tokens_seen": 19464192 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029563492063492064, + "loss": 5.7948, + "theoretical_loss": 6.142515441217064, + "tokens_seen": 19529728 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029662698412698414, + "loss": 5.4542, + "theoretical_loss": 6.1390148596877605, + "tokens_seen": 19595264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 64900, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.347830295562744, + "objective/train/theoretical_loss": 6.135529231940326, + "objective/train/tokens_used": 40120800, + "theoretical_loss": 6.135529231940326, + "tokens_seen": 19660800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029761904761904765, + "loss": 5.5229, + "theoretical_loss": 6.135529231940326, + "tokens_seen": 19660800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002986111111111111, + "loss": 5.7379, + "theoretical_loss": 6.132058444567705, + "tokens_seen": 19726336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002996031746031746, + "loss": 5.8225, + "theoretical_loss": 6.128602385396022, + "tokens_seen": 19791872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003005952380952381, + "loss": 5.4563, + "theoretical_loss": 6.125160943467138, + "tokens_seen": 19857408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003015873015873016, + "loss": 5.8875, + "theoretical_loss": 6.121734009021521, + "tokens_seen": 19922944 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030257936507936505, + "loss": 5.7803, + "theoretical_loss": 6.118321473481398, + "tokens_seen": 19988480 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030357142857142855, + "loss": 5.6634, + "theoretical_loss": 6.114923229434213, + "tokens_seen": 20054016 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030456349206349205, + "loss": 5.7123, + "theoretical_loss": 6.111539170616359, + "tokens_seen": 20119552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003055555555555556, + "loss": 6.0343, + "theoretical_loss": 6.108169191897195, + "tokens_seen": 20185088 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030654761904761905, + "loss": 5.7518, + "theoretical_loss": 6.104813189263336, + "tokens_seen": 20250624 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030753968253968255, + "loss": 5.7124, + "theoretical_loss": 6.101471059803204, + "tokens_seen": 20316160 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030853174603174605, + "loss": 5.6318, + "theoretical_loss": 6.098142701691856, + "tokens_seen": 20381696 + }, + { + "epoch": 0.01, + "learning_rate": 0.00030952380952380956, + "loss": 5.7916, + "theoretical_loss": 6.094828014176053, + "tokens_seen": 20447232 + }, + { + "epoch": 0.01, + "learning_rate": 0.000310515873015873, + "loss": 5.7694, + "theoretical_loss": 6.091526897559593, + "tokens_seen": 20512768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003115079365079365, + "loss": 5.7668, + "theoretical_loss": 6.088239253188885, + "tokens_seen": 20578304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003125, + "loss": 5.5951, + "theoretical_loss": 6.084964983438763, + "tokens_seen": 20643840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003134920634920635, + "loss": 5.764, + "theoretical_loss": 6.0817039916985465, + "tokens_seen": 20709376 + }, + { + "epoch": 0.01, + "learning_rate": 0.000314484126984127, + "loss": 5.7666, + "theoretical_loss": 6.078456182358325, + "tokens_seen": 20774912 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031547619047619046, + "loss": 5.6277, + "theoretical_loss": 6.075221460795472, + "tokens_seen": 20840448 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031646825396825396, + "loss": 5.5654, + "theoretical_loss": 6.071999733361386, + "tokens_seen": 20905984 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031746031746031746, + "loss": 5.4487, + "theoretical_loss": 6.068790907368448, + "tokens_seen": 20971520 + }, + { + "epoch": 0.01, + "learning_rate": 0.00031845238095238096, + "loss": 5.4991, + "theoretical_loss": 6.0655948910771915, + "tokens_seen": 21037056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003194444444444444, + "loss": 5.5788, + "theoretical_loss": 6.062411593683687, + "tokens_seen": 21102592 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032043650793650796, + "loss": 5.7443, + "theoretical_loss": 6.059240925307134, + "tokens_seen": 21168128 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032142857142857147, + "loss": 5.6239, + "theoretical_loss": 6.056082796977648, + "tokens_seen": 21233664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 67698, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.764566898345947, + "objective/train/theoretical_loss": 6.052937120624258, + "objective/train/tokens_used": 41759200, + "theoretical_loss": 6.052937120624258, + "tokens_seen": 21299200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032242063492063497, + "loss": 5.6561, + "theoretical_loss": 6.052937120624258, + "tokens_seen": 21299200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003234126984126984, + "loss": 5.6063, + "theoretical_loss": 6.049803809063083, + "tokens_seen": 21364736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003244047619047619, + "loss": 5.5698, + "theoretical_loss": 6.0466827759857145, + "tokens_seen": 21430272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003253968253968254, + "loss": 5.7129, + "theoretical_loss": 6.04357393594778, + "tokens_seen": 21495808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003263888888888889, + "loss": 5.6283, + "theoretical_loss": 6.040477204357686, + "tokens_seen": 21561344 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032738095238095237, + "loss": 5.6676, + "theoretical_loss": 6.037392497465552, + "tokens_seen": 21626880 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032837301587301587, + "loss": 5.5424, + "theoretical_loss": 6.034319732352309, + "tokens_seen": 21692416 + }, + { + "epoch": 0.01, + "learning_rate": 0.00032936507936507937, + "loss": 5.6342, + "theoretical_loss": 6.031258826918979, + "tokens_seen": 21757952 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033035714285714287, + "loss": 5.4564, + "theoretical_loss": 6.0282096998761245, + "tokens_seen": 21823488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003313492063492063, + "loss": 5.6654, + "theoretical_loss": 6.025172270733464, + "tokens_seen": 21889024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003323412698412698, + "loss": 5.634, + "theoretical_loss": 6.0221464597896475, + "tokens_seen": 21954560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003333333333333333, + "loss": 5.7218, + "theoretical_loss": 6.0191321881221995, + "tokens_seen": 22020096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003343253968253968, + "loss": 5.7347, + "theoretical_loss": 6.016129377577614, + "tokens_seen": 22085632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003353174603174603, + "loss": 5.5947, + "theoretical_loss": 6.01313795076161, + "tokens_seen": 22151168 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003363095238095238, + "loss": 5.6702, + "theoretical_loss": 6.010157831029533, + "tokens_seen": 22216704 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033730158730158733, + "loss": 5.6471, + "theoretical_loss": 6.007188942476907, + "tokens_seen": 22282240 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033829365079365083, + "loss": 5.5923, + "theoretical_loss": 6.0042312099301425, + "tokens_seen": 22347776 + }, + { + "epoch": 0.01, + "learning_rate": 0.00033928571428571433, + "loss": 5.4866, + "theoretical_loss": 6.001284558937368, + "tokens_seen": 22413312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003402777777777778, + "loss": 5.5051, + "theoretical_loss": 5.998348915759426, + "tokens_seen": 22478848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003412698412698413, + "loss": 5.5981, + "theoretical_loss": 5.995424207360987, + "tokens_seen": 22544384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003422619047619048, + "loss": 5.6425, + "theoretical_loss": 5.992510361401818, + "tokens_seen": 22609920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003432539682539683, + "loss": 5.5758, + "theoretical_loss": 5.989607306228168, + "tokens_seen": 22675456 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034424603174603173, + "loss": 5.5342, + "theoretical_loss": 5.986714970864292, + "tokens_seen": 22740992 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034523809523809523, + "loss": 5.6725, + "theoretical_loss": 5.983833285004112, + "tokens_seen": 22806528 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034623015873015873, + "loss": 5.5913, + "theoretical_loss": 5.980962179002983, + "tokens_seen": 22872064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 70317, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.814751148223877, + "objective/train/theoretical_loss": 5.978101583869607, + "objective/train/tokens_used": 43397600, + "theoretical_loss": 5.978101583869607, + "tokens_seen": 22937600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034722222222222224, + "loss": 5.7569, + "theoretical_loss": 5.978101583869607, + "tokens_seen": 22937600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003482142857142857, + "loss": 5.7987, + "theoretical_loss": 5.975251431258057, + "tokens_seen": 23003136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003492063492063492, + "loss": 5.2893, + "theoretical_loss": 5.972411653459913, + "tokens_seen": 23068672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003501984126984127, + "loss": 5.6931, + "theoretical_loss": 5.9695821833965335, + "tokens_seen": 23134208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003511904761904762, + "loss": 5.6142, + "theoretical_loss": 5.966762954611432, + "tokens_seen": 23199744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003521825396825397, + "loss": 5.6498, + "theoretical_loss": 5.963953901262764, + "tokens_seen": 23265280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003531746031746032, + "loss": 5.3034, + "theoretical_loss": 5.961154958115937, + "tokens_seen": 23330816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003541666666666667, + "loss": 5.4923, + "theoretical_loss": 5.958366060536315, + "tokens_seen": 23396352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003551587301587302, + "loss": 5.7133, + "theoretical_loss": 5.955587144482044, + "tokens_seen": 23461888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003561507936507937, + "loss": 5.4702, + "theoretical_loss": 5.952818146496978, + "tokens_seen": 23527424 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035714285714285714, + "loss": 5.4197, + "theoretical_loss": 5.950059003703704, + "tokens_seen": 23592960 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035813492063492064, + "loss": 5.4804, + "theoretical_loss": 5.94730965379668, + "tokens_seen": 23658496 + }, + { + "epoch": 0.01, + "learning_rate": 0.00035912698412698415, + "loss": 5.3389, + "theoretical_loss": 5.944570035035458, + "tokens_seen": 23724032 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036011904761904765, + "loss": 5.4533, + "theoretical_loss": 5.941840086238027, + "tokens_seen": 23789568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003611111111111111, + "loss": 5.5779, + "theoretical_loss": 5.939119746774228, + "tokens_seen": 23855104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003621031746031746, + "loss": 5.5879, + "theoretical_loss": 5.936408956559284, + "tokens_seen": 23920640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003630952380952381, + "loss": 5.6307, + "theoretical_loss": 5.933707656047414, + "tokens_seen": 23986176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003640873015873016, + "loss": 5.3937, + "theoretical_loss": 5.93101578622554, + "tokens_seen": 24051712 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036507936507936505, + "loss": 5.6046, + "theoretical_loss": 5.928333288607086, + "tokens_seen": 24117248 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036607142857142855, + "loss": 5.3751, + "theoretical_loss": 5.925660105225867, + "tokens_seen": 24182784 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036706349206349205, + "loss": 5.6265, + "theoretical_loss": 5.92299617863006, + "tokens_seen": 24248320 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003680555555555556, + "loss": 5.6131, + "theoretical_loss": 5.920341451876267, + "tokens_seen": 24313856 + }, + { + "epoch": 0.01, + "learning_rate": 0.00036904761904761905, + "loss": 5.5115, + "theoretical_loss": 5.9176958685236585, + "tokens_seen": 24379392 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037003968253968255, + "loss": 5.5909, + "theoretical_loss": 5.9150593726282015, + "tokens_seen": 24444928 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037103174603174606, + "loss": 5.5129, + "theoretical_loss": 5.912431908736972, + "tokens_seen": 24510464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 71675, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.182568073272705, + "objective/train/theoretical_loss": 5.909813421882534, + "objective/train/tokens_used": 45036000, + "theoretical_loss": 5.909813421882534, + "tokens_seen": 24576000 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037202380952380956, + "loss": 5.403, + "theoretical_loss": 5.909813421882534, + "tokens_seen": 24576000 + }, + { + "epoch": 0.01, + "learning_rate": 0.000373015873015873, + "loss": 5.7366, + "theoretical_loss": 5.907203857577422, + "tokens_seen": 24641536 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003740079365079365, + "loss": 5.578, + "theoretical_loss": 5.9046031618086765, + "tokens_seen": 24707072 + }, + { + "epoch": 0.01, + "learning_rate": 0.000375, + "loss": 5.4354, + "theoretical_loss": 5.902011281032472, + "tokens_seen": 24772608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003759920634920635, + "loss": 5.5527, + "theoretical_loss": 5.899428162168808, + "tokens_seen": 24838144 + }, + { + "epoch": 0.01, + "learning_rate": 0.000376984126984127, + "loss": 5.4374, + "theoretical_loss": 5.896853752596286, + "tokens_seen": 24903680 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037797619047619046, + "loss": 5.454, + "theoretical_loss": 5.894288000146949, + "tokens_seen": 24969216 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037896825396825396, + "loss": 5.4818, + "theoretical_loss": 5.891730853101199, + "tokens_seen": 25034752 + }, + { + "epoch": 0.01, + "learning_rate": 0.00037996031746031746, + "loss": 5.6886, + "theoretical_loss": 5.88918226018278, + "tokens_seen": 25100288 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038095238095238096, + "loss": 5.4685, + "theoretical_loss": 5.8866421705538325, + "tokens_seen": 25165824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003819444444444444, + "loss": 5.4909, + "theoretical_loss": 5.8841105338100155, + "tokens_seen": 25231360 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038293650793650797, + "loss": 5.4441, + "theoretical_loss": 5.881587299975694, + "tokens_seen": 25296896 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038392857142857147, + "loss": 5.3457, + "theoretical_loss": 5.8790724194991935, + "tokens_seen": 25362432 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038492063492063497, + "loss": 5.1502, + "theoretical_loss": 5.876565843248124, + "tokens_seen": 25427968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003859126984126984, + "loss": 5.555, + "theoretical_loss": 5.8740675225047525, + "tokens_seen": 25493504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003869047619047619, + "loss": 5.2847, + "theoretical_loss": 5.871577408961457, + "tokens_seen": 25559040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003878968253968254, + "loss": 5.1909, + "theoretical_loss": 5.869095454716231, + "tokens_seen": 25624576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003888888888888889, + "loss": 5.4806, + "theoretical_loss": 5.866621612268246, + "tokens_seen": 25690112 + }, + { + "epoch": 0.01, + "learning_rate": 0.00038988095238095237, + "loss": 5.4231, + "theoretical_loss": 5.864155834513486, + "tokens_seen": 25755648 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039087301587301587, + "loss": 5.5616, + "theoretical_loss": 5.8616980747404295, + "tokens_seen": 25821184 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039186507936507937, + "loss": 5.3517, + "theoretical_loss": 5.859248286625787, + "tokens_seen": 25886720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003928571428571429, + "loss": 5.3811, + "theoretical_loss": 5.856806424230314, + "tokens_seen": 25952256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003938492063492063, + "loss": 5.333, + "theoretical_loss": 5.854372441994654, + "tokens_seen": 26017792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003948412698412698, + "loss": 5.4267, + "theoretical_loss": 5.851946294735258, + "tokens_seen": 26083328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003958333333333333, + "loss": 5.294, + "theoretical_loss": 5.849527937640345, + "tokens_seen": 26148864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 74395, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.288687705993652, + "objective/train/theoretical_loss": 5.8471173262659235, + "objective/train/tokens_used": 46674400, + "theoretical_loss": 5.8471173262659235, + "tokens_seen": 26214400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003968253968253968, + "loss": 5.3511, + "theoretical_loss": 5.8471173262659235, + "tokens_seen": 26214400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003978174603174603, + "loss": 5.2969, + "theoretical_loss": 5.84471441653186, + "tokens_seen": 26279936 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039880952380952383, + "loss": 5.4034, + "theoretical_loss": 5.842319164718004, + "tokens_seen": 26345472 + }, + { + "epoch": 0.01, + "learning_rate": 0.00039980158730158733, + "loss": 5.3294, + "theoretical_loss": 5.83993152746036, + "tokens_seen": 26411008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040079365079365083, + "loss": 5.2528, + "theoretical_loss": 5.83755146174731, + "tokens_seen": 26476544 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040178571428571433, + "loss": 5.4902, + "theoretical_loss": 5.835178924915889, + "tokens_seen": 26542080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004027777777777778, + "loss": 5.1137, + "theoretical_loss": 5.832813874648102, + "tokens_seen": 26607616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004037698412698413, + "loss": 5.2685, + "theoretical_loss": 5.8304562689673, + "tokens_seen": 26673152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004047619047619048, + "loss": 5.2567, + "theoretical_loss": 5.828106066234588, + "tokens_seen": 26738688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004057539682539683, + "loss": 5.4293, + "theoretical_loss": 5.825763225145295, + "tokens_seen": 26804224 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040674603174603173, + "loss": 5.194, + "theoretical_loss": 5.823427704725473, + "tokens_seen": 26869760 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040773809523809523, + "loss": 5.2885, + "theoretical_loss": 5.82109946432846, + "tokens_seen": 26935296 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040873015873015874, + "loss": 5.2425, + "theoretical_loss": 5.818778463631473, + "tokens_seen": 27000832 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040972222222222224, + "loss": 5.2221, + "theoretical_loss": 5.816464662632243, + "tokens_seen": 27066368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004107142857142857, + "loss": 5.4004, + "theoretical_loss": 5.8141580216457065, + "tokens_seen": 27131904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004117063492063492, + "loss": 5.3422, + "theoretical_loss": 5.811858501300729, + "tokens_seen": 27197440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004126984126984127, + "loss": 5.4033, + "theoretical_loss": 5.809566062536868, + "tokens_seen": 27262976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004136904761904762, + "loss": 5.1002, + "theoretical_loss": 5.807280666601191, + "tokens_seen": 27328512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004146825396825397, + "loss": 5.4236, + "theoretical_loss": 5.805002275045111, + "tokens_seen": 27394048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004156746031746032, + "loss": 5.0099, + "theoretical_loss": 5.8027308497212875, + "tokens_seen": 27459584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004166666666666667, + "loss": 5.5006, + "theoretical_loss": 5.800466352780546, + "tokens_seen": 27525120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004176587301587302, + "loss": 5.1882, + "theoretical_loss": 5.798208746668847, + "tokens_seen": 27590656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004186507936507937, + "loss": 5.4293, + "theoretical_loss": 5.795957994124291, + "tokens_seen": 27656192 + }, + { + "epoch": 0.01, + "learning_rate": 0.00041964285714285714, + "loss": 5.3885, + "theoretical_loss": 5.7937140581741575, + "tokens_seen": 27721728 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042063492063492065, + "loss": 5.2637, + "theoretical_loss": 5.791476902131985, + "tokens_seen": 27787264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 77021, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.936613082885742, + "objective/train/theoretical_loss": 5.789246489594688, + "objective/train/tokens_used": 48312800, + "theoretical_loss": 5.789246489594688, + "tokens_seen": 27852800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042162698412698415, + "loss": 5.0853, + "theoretical_loss": 5.789246489594688, + "tokens_seen": 27852800 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042261904761904765, + "loss": 5.3841, + "theoretical_loss": 5.787022784439701, + "tokens_seen": 27918336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004236111111111111, + "loss": 5.2923, + "theoretical_loss": 5.784805750822171, + "tokens_seen": 27983872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004246031746031746, + "loss": 4.9095, + "theoretical_loss": 5.782595353172176, + "tokens_seen": 28049408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004255952380952381, + "loss": 5.1513, + "theoretical_loss": 5.780391556191977, + "tokens_seen": 28114944 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004265873015873016, + "loss": 5.3153, + "theoretical_loss": 5.778194324853311, + "tokens_seen": 28180480 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042757936507936505, + "loss": 5.1938, + "theoretical_loss": 5.776003624394711, + "tokens_seen": 28246016 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042857142857142855, + "loss": 5.3685, + "theoretical_loss": 5.773819420318858, + "tokens_seen": 28311552 + }, + { + "epoch": 0.01, + "learning_rate": 0.00042956349206349205, + "loss": 5.3351, + "theoretical_loss": 5.771641678389971, + "tokens_seen": 28377088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004305555555555556, + "loss": 5.4538, + "theoretical_loss": 5.769470364631225, + "tokens_seen": 28442624 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043154761904761905, + "loss": 5.0668, + "theoretical_loss": 5.767305445322201, + "tokens_seen": 28508160 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043253968253968256, + "loss": 5.4079, + "theoretical_loss": 5.765146886996363, + "tokens_seen": 28573696 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043353174603174606, + "loss": 5.3532, + "theoretical_loss": 5.762994656438579, + "tokens_seen": 28639232 + }, + { + "epoch": 0.01, + "learning_rate": 0.00043452380952380956, + "loss": 5.0999, + "theoretical_loss": 5.760848720682651, + "tokens_seen": 28704768 + }, + { + "epoch": 0.01, + "learning_rate": 0.000435515873015873, + "loss": 5.3785, + "theoretical_loss": 5.758709047008894, + "tokens_seen": 28770304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004365079365079365, + "loss": 5.3923, + "theoretical_loss": 5.756575602941732, + "tokens_seen": 28835840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004375, + "loss": 5.2236, + "theoretical_loss": 5.75444835624733, + "tokens_seen": 28901376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004384920634920635, + "loss": 5.3027, + "theoretical_loss": 5.752327274931249, + "tokens_seen": 28966912 + }, + { + "epoch": 0.01, + "learning_rate": 0.000439484126984127, + "loss": 5.43, + "theoretical_loss": 5.750212327236129, + "tokens_seen": 29032448 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044047619047619046, + "loss": 5.3153, + "theoretical_loss": 5.7481034816394105, + "tokens_seen": 29097984 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044146825396825396, + "loss": 5.4283, + "theoretical_loss": 5.7460007068510635, + "tokens_seen": 29163520 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044246031746031746, + "loss": 5.3277, + "theoretical_loss": 5.74390397181136, + "tokens_seen": 29229056 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044345238095238096, + "loss": 5.1985, + "theoretical_loss": 5.741813245688668, + "tokens_seen": 29294592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004444444444444444, + "loss": 5.2678, + "theoretical_loss": 5.739728497877267, + "tokens_seen": 29360128 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044543650793650797, + "loss": 5.1771, + "theoretical_loss": 5.737649697995197, + "tokens_seen": 29425664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 79798, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.1653008460998535, + "objective/train/theoretical_loss": 5.7355768158821245, + "objective/train/tokens_used": 49951200, + "theoretical_loss": 5.7355768158821245, + "tokens_seen": 29491200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044642857142857147, + "loss": 5.2759, + "theoretical_loss": 5.7355768158821245, + "tokens_seen": 29491200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00044742063492063497, + "loss": 5.261, + "theoretical_loss": 5.73350982159724, + "tokens_seen": 29556736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004484126984126984, + "loss": 5.3416, + "theoretical_loss": 5.731448685417178, + "tokens_seen": 29622272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004494047619047619, + "loss": 5.2063, + "theoretical_loss": 5.729393377833956, + "tokens_seen": 29687808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004503968253968254, + "loss": 5.2898, + "theoretical_loss": 5.7273438695529535, + "tokens_seen": 29753344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004513888888888889, + "loss": 5.2954, + "theoretical_loss": 5.725300131490888, + "tokens_seen": 29818880 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045238095238095237, + "loss": 5.3282, + "theoretical_loss": 5.7232621347738455, + "tokens_seen": 29884416 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045337301587301587, + "loss": 5.2503, + "theoretical_loss": 5.721229850735305, + "tokens_seen": 29949952 + }, + { + "epoch": 0.01, + "learning_rate": 0.00045436507936507937, + "loss": 5.1738, + "theoretical_loss": 5.719203250914208, + "tokens_seen": 30015488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004553571428571429, + "loss": 5.2278, + "theoretical_loss": 5.717182307053037, + "tokens_seen": 30081024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004563492063492063, + "loss": 5.4558, + "theoretical_loss": 5.715166991095922, + "tokens_seen": 30146560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004573412698412698, + "loss": 5.3517, + "theoretical_loss": 5.713157275186761, + "tokens_seen": 30212096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004583333333333333, + "loss": 5.2566, + "theoretical_loss": 5.71115313166738, + "tokens_seen": 30277632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004593253968253968, + "loss": 5.1988, + "theoretical_loss": 5.709154533075688, + "tokens_seen": 30343168 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046031746031746033, + "loss": 5.3092, + "theoretical_loss": 5.707161452143879, + "tokens_seen": 30408704 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046130952380952383, + "loss": 5.1524, + "theoretical_loss": 5.7051738617966326, + "tokens_seen": 30474240 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046230158730158733, + "loss": 5.2729, + "theoretical_loss": 5.7031917351493515, + "tokens_seen": 30539776 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046329365079365083, + "loss": 5.2209, + "theoretical_loss": 5.701215045506411, + "tokens_seen": 30605312 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046428571428571433, + "loss": 5.3483, + "theoretical_loss": 5.699243766359421, + "tokens_seen": 30670848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004652777777777778, + "loss": 5.1607, + "theoretical_loss": 5.697277871385534, + "tokens_seen": 30736384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004662698412698413, + "loss": 5.2284, + "theoretical_loss": 5.695317334445736, + "tokens_seen": 30801920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004672619047619048, + "loss": 5.1931, + "theoretical_loss": 5.693362129583184, + "tokens_seen": 30867456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004682539682539683, + "loss": 5.2138, + "theoretical_loss": 5.691412231021549, + "tokens_seen": 30932992 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046924603174603173, + "loss": 5.197, + "theoretical_loss": 5.689467613163388, + "tokens_seen": 30998528 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047023809523809523, + "loss": 5.3185, + "theoretical_loss": 5.687528250588518, + "tokens_seen": 31064064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 82602, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.185832977294922, + "objective/train/theoretical_loss": 5.6855941180524265, + "objective/train/tokens_used": 51589600, + "theoretical_loss": 5.6855941180524265, + "tokens_seen": 31129600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047123015873015874, + "loss": 5.1136, + "theoretical_loss": 5.6855941180524265, + "tokens_seen": 31129600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00047222222222222224, + "loss": 5.2111, + "theoretical_loss": 5.683665190484683, + "tokens_seen": 31195136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004732142857142857, + "loss": 5.1742, + "theoretical_loss": 5.681741442987381, + "tokens_seen": 31260672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004742063492063492, + "loss": 5.0943, + "theoretical_loss": 5.679822850833591, + "tokens_seen": 31326208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004751984126984127, + "loss": 5.1275, + "theoretical_loss": 5.677909389465831, + "tokens_seen": 31391744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004761904761904762, + "loss": 5.0673, + "theoretical_loss": 5.676001034494554, + "tokens_seen": 31457280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004771825396825397, + "loss": 5.1805, + "theoretical_loss": 5.674097761696653, + "tokens_seen": 31522816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004781746031746032, + "loss": 5.0702, + "theoretical_loss": 5.672199547013983, + "tokens_seen": 31588352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004791666666666667, + "loss": 5.1862, + "theoretical_loss": 5.670306366551898, + "tokens_seen": 31653888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004801587301587302, + "loss": 5.297, + "theoretical_loss": 5.6684181965778, + "tokens_seen": 31719424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004811507936507937, + "loss": 5.3808, + "theoretical_loss": 5.666535013519715, + "tokens_seen": 31784960 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048214285714285715, + "loss": 5.2057, + "theoretical_loss": 5.6646567939648715, + "tokens_seen": 31850496 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048313492063492065, + "loss": 5.3193, + "theoretical_loss": 5.6627835146583045, + "tokens_seen": 31916032 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048412698412698415, + "loss": 5.22, + "theoretical_loss": 5.660915152501465, + "tokens_seen": 31981568 + }, + { + "epoch": 0.01, + "learning_rate": 0.00048511904761904765, + "loss": 5.2162, + "theoretical_loss": 5.659051684550857, + "tokens_seen": 32047104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004861111111111111, + "loss": 5.1626, + "theoretical_loss": 5.657193088016677, + "tokens_seen": 32112640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004871031746031746, + "loss": 5.1728, + "theoretical_loss": 5.655339340261474, + "tokens_seen": 32178176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004880952380952381, + "loss": 5.1504, + "theoretical_loss": 5.653490418798825, + "tokens_seen": 32243712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004890873015873016, + "loss": 5.335, + "theoretical_loss": 5.651646301292022, + "tokens_seen": 32309248 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004900793650793651, + "loss": 5.2474, + "theoretical_loss": 5.649806965552774, + "tokens_seen": 32374784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004910714285714286, + "loss": 5.163, + "theoretical_loss": 5.6479723895399205, + "tokens_seen": 32440320 + }, + { + "epoch": 0.01, + "learning_rate": 0.000492063492063492, + "loss": 5.0628, + "theoretical_loss": 5.6461425513581665, + "tokens_seen": 32505856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004930555555555556, + "loss": 5.1941, + "theoretical_loss": 5.6443174292568195, + "tokens_seen": 32571392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004940476190476191, + "loss": 5.085, + "theoretical_loss": 5.6424970016285485, + "tokens_seen": 32636928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004950396825396826, + "loss": 5.1651, + "theoretical_loss": 5.640681247008156, + "tokens_seen": 32702464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 84200, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.8913254737854, + "objective/train/theoretical_loss": 5.638870144071353, + "objective/train/tokens_used": 53228000, + "theoretical_loss": 5.638870144071353, + "tokens_seen": 32768000 + }, + { + "epoch": 0.01, + "learning_rate": 0.000496031746031746, + "loss": 5.0578, + "theoretical_loss": 5.638870144071353, + "tokens_seen": 32768000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004970238095238095, + "loss": 5.1256, + "theoretical_loss": 5.637063671633564, + "tokens_seen": 32833536 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498015873015873, + "loss": 5.254, + "theoretical_loss": 5.635261808648728, + "tokens_seen": 32899072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990079365079365, + "loss": 5.0475, + "theoretical_loss": 5.6334645342081195, + "tokens_seen": 32964608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0005, + "loss": 5.2743, + "theoretical_loss": 5.631671827539186, + "tokens_seen": 33030144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999899699097292, + "loss": 5.1251, + "theoretical_loss": 5.629883668004389, + "tokens_seen": 33095680 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999799398194584, + "loss": 5.4308, + "theoretical_loss": 5.628100035100061, + "tokens_seen": 33161216 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999699097291876, + "loss": 5.0668, + "theoretical_loss": 5.626320908455279, + "tokens_seen": 33226752 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999598796389167, + "loss": 5.1958, + "theoretical_loss": 5.6245462678307385, + "tokens_seen": 33292288 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499949849548646, + "loss": 5.1531, + "theoretical_loss": 5.622776093117652, + "tokens_seen": 33357824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999398194583751, + "loss": 5.0803, + "theoretical_loss": 5.621010364336651, + "tokens_seen": 33423360 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999297893681044, + "loss": 5.1534, + "theoretical_loss": 5.619249061636698, + "tokens_seen": 33488896 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999197592778335, + "loss": 5.1534, + "theoretical_loss": 5.61749216529402, + "tokens_seen": 33554432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004999097291875627, + "loss": 5.0052, + "theoretical_loss": 5.615739655711037, + "tokens_seen": 33619968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998996990972919, + "loss": 4.9935, + "theoretical_loss": 5.61399151341532, + "tokens_seen": 33685504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998896690070211, + "loss": 4.6181, + "theoretical_loss": 5.6122477190585425, + "tokens_seen": 33751040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998796389167503, + "loss": 4.8924, + "theoretical_loss": 5.610508253415453, + "tokens_seen": 33816576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998696088264795, + "loss": 5.1347, + "theoretical_loss": 5.6087730973828585, + "tokens_seen": 33882112 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998595787362087, + "loss": 5.1395, + "theoretical_loss": 5.6070422319786095, + "tokens_seen": 33947648 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998495486459378, + "loss": 4.8954, + "theoretical_loss": 5.605315638340606, + "tokens_seen": 34013184 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499839518555667, + "loss": 5.2082, + "theoretical_loss": 5.603593297725807, + "tokens_seen": 34078720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998294884653962, + "loss": 5.1833, + "theoretical_loss": 5.601875191509249, + "tokens_seen": 34144256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998194583751254, + "loss": 5.2404, + "theoretical_loss": 5.600161301183084, + "tokens_seen": 34209792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998094282848546, + "loss": 5.0395, + "theoretical_loss": 5.598451608355614, + "tokens_seen": 34275328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997993981945837, + "loss": 5.2058, + "theoretical_loss": 5.596746094750342, + "tokens_seen": 34340864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 86919, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.993339538574219, + "objective/train/theoretical_loss": 5.595044742205037, + "objective/train/tokens_used": 54866400, + "theoretical_loss": 5.595044742205037, + "tokens_seen": 34406400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997893681043129, + "loss": 5.1211, + "theoretical_loss": 5.595044742205037, + "tokens_seen": 34406400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997793380140421, + "loss": 5.122, + "theoretical_loss": 5.5933475326707995, + "tokens_seen": 34471936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997693079237714, + "loss": 4.9179, + "theoretical_loss": 5.591654448211143, + "tokens_seen": 34537472 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997592778335005, + "loss": 5.1214, + "theoretical_loss": 5.589965471001077, + "tokens_seen": 34603008 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997492477432298, + "loss": 4.9931, + "theoretical_loss": 5.5882805833262115, + "tokens_seen": 34668544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997392176529588, + "loss": 5.0034, + "theoretical_loss": 5.586599767581859, + "tokens_seen": 34734080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997291875626881, + "loss": 5.181, + "theoretical_loss": 5.584923006272151, + "tokens_seen": 34799616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997191574724173, + "loss": 5.0833, + "theoretical_loss": 5.583250282009159, + "tokens_seen": 34865152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997091273821465, + "loss": 5.0421, + "theoretical_loss": 5.581581577512031, + "tokens_seen": 34930688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996990972918757, + "loss": 5.0142, + "theoretical_loss": 5.579916875606134, + "tokens_seen": 34996224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996890672016048, + "loss": 5.0154, + "theoretical_loss": 5.578256159222196, + "tokens_seen": 35061760 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499679037111334, + "loss": 4.939, + "theoretical_loss": 5.576599411395472, + "tokens_seen": 35127296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996690070210632, + "loss": 5.0111, + "theoretical_loss": 5.574946615264906, + "tokens_seen": 35192832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996589769307924, + "loss": 4.7694, + "theoretical_loss": 5.5732977540723105, + "tokens_seen": 35258368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996489468405216, + "loss": 4.9303, + "theoretical_loss": 5.571652811161542, + "tokens_seen": 35323904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996389167502507, + "loss": 5.0498, + "theoretical_loss": 5.570011769977693, + "tokens_seen": 35389440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996288866599799, + "loss": 5.1185, + "theoretical_loss": 5.568374614066299, + "tokens_seen": 35454976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996188565697091, + "loss": 5.1103, + "theoretical_loss": 5.566741327072535, + "tokens_seen": 35520512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996088264794383, + "loss": 4.9533, + "theoretical_loss": 5.565111892740433, + "tokens_seen": 35586048 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995987963891675, + "loss": 5.002, + "theoretical_loss": 5.563486294912105, + "tokens_seen": 35651584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995887662988968, + "loss": 5.1338, + "theoretical_loss": 5.56186451752697, + "tokens_seen": 35717120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995787362086258, + "loss": 4.9329, + "theoretical_loss": 5.560246544620993, + "tokens_seen": 35782656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995687061183551, + "loss": 5.0387, + "theoretical_loss": 5.558632360325929, + "tokens_seen": 35848192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995586760280842, + "loss": 5.1425, + "theoretical_loss": 5.557021948868571, + "tokens_seen": 35913728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995486459378135, + "loss": 4.7661, + "theoretical_loss": 5.555415294570011, + "tokens_seen": 35979264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 89797, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.706801891326904, + "objective/train/theoretical_loss": 5.553812381844907, + "objective/train/tokens_used": 56504800, + "theoretical_loss": 5.553812381844907, + "tokens_seen": 36044800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995386158475427, + "loss": 4.9598, + "theoretical_loss": 5.553812381844907, + "tokens_seen": 36044800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995285857572718, + "loss": 4.9648, + "theoretical_loss": 5.552213195200755, + "tokens_seen": 36110336 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499518555667001, + "loss": 5.2254, + "theoretical_loss": 5.550617719237167, + "tokens_seen": 36175872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995085255767302, + "loss": 5.0886, + "theoretical_loss": 5.549025938645155, + "tokens_seen": 36241408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994984954864594, + "loss": 4.9808, + "theoretical_loss": 5.547437838206435, + "tokens_seen": 36306944 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994884653961886, + "loss": 4.9007, + "theoretical_loss": 5.545853402792717, + "tokens_seen": 36372480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994784353059178, + "loss": 4.9751, + "theoretical_loss": 5.544272617365014, + "tokens_seen": 36438016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994684052156469, + "loss": 5.1435, + "theoretical_loss": 5.542695466972956, + "tokens_seen": 36503552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994583751253761, + "loss": 5.0653, + "theoretical_loss": 5.541121936754111, + "tokens_seen": 36569088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994483450351053, + "loss": 5.0653, + "theoretical_loss": 5.539552011933312, + "tokens_seen": 36634624 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994383149448345, + "loss": 5.0246, + "theoretical_loss": 5.537985677821986, + "tokens_seen": 36700160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994282848545637, + "loss": 4.9959, + "theoretical_loss": 5.536422919817495, + "tokens_seen": 36765696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994182547642928, + "loss": 5.2206, + "theoretical_loss": 5.5348637234024824, + "tokens_seen": 36831232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994082246740221, + "loss": 5.1661, + "theoretical_loss": 5.53330807414422, + "tokens_seen": 36896768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993981945837512, + "loss": 4.9486, + "theoretical_loss": 5.5317559576939725, + "tokens_seen": 36962304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993881644934805, + "loss": 4.9906, + "theoretical_loss": 5.530207359786353, + "tokens_seen": 37027840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993781344032096, + "loss": 5.0665, + "theoretical_loss": 5.5286622662386975, + "tokens_seen": 37093376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993681043129389, + "loss": 4.923, + "theoretical_loss": 5.52712066295044, + "tokens_seen": 37158912 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499358074222668, + "loss": 5.0928, + "theoretical_loss": 5.525582535902489, + "tokens_seen": 37224448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993480441323972, + "loss": 5.001, + "theoretical_loss": 5.524047871156618, + "tokens_seen": 37289984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993380140421264, + "loss": 4.7758, + "theoretical_loss": 5.52251665485486, + "tokens_seen": 37355520 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993279839518556, + "loss": 4.9134, + "theoretical_loss": 5.520988873218897, + "tokens_seen": 37421056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993179538615848, + "loss": 4.9349, + "theoretical_loss": 5.519464512549478, + "tokens_seen": 37486592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993079237713139, + "loss": 4.6366, + "theoretical_loss": 5.5179435592258095, + "tokens_seen": 37552128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992978936810431, + "loss": 4.9398, + "theoretical_loss": 5.516425999704987, + "tokens_seen": 37617664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 92627, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.293185710906982, + "objective/train/theoretical_loss": 5.514911820521407, + "objective/train/tokens_used": 58143200, + "theoretical_loss": 5.514911820521407, + "tokens_seen": 37683200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992878635907723, + "loss": 5.0295, + "theoretical_loss": 5.514911820521407, + "tokens_seen": 37683200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992778335005015, + "loss": 5.0782, + "theoretical_loss": 5.5134010082861895, + "tokens_seen": 37748736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992678034102307, + "loss": 5.0081, + "theoretical_loss": 5.511893549686616, + "tokens_seen": 37814272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992577733199598, + "loss": 4.9365, + "theoretical_loss": 5.51038943148556, + "tokens_seen": 37879808 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499247743229689, + "loss": 4.8296, + "theoretical_loss": 5.508888640520928, + "tokens_seen": 37945344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992377131394183, + "loss": 4.9377, + "theoretical_loss": 5.50739116370511, + "tokens_seen": 38010880 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992276830491475, + "loss": 4.9181, + "theoretical_loss": 5.505896988024423, + "tokens_seen": 38076416 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992176529588767, + "loss": 5.0056, + "theoretical_loss": 5.5044061005385725, + "tokens_seen": 38141952 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004992076228686059, + "loss": 4.8272, + "theoretical_loss": 5.502918488380116, + "tokens_seen": 38207488 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499197592778335, + "loss": 4.9873, + "theoretical_loss": 5.501434138753918, + "tokens_seen": 38273024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991875626880642, + "loss": 4.9004, + "theoretical_loss": 5.499953038936635, + "tokens_seen": 38338560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991775325977934, + "loss": 5.0152, + "theoretical_loss": 5.498475176276176, + "tokens_seen": 38404096 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991675025075226, + "loss": 4.8724, + "theoretical_loss": 5.497000538191195, + "tokens_seen": 38469632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991574724172518, + "loss": 5.0293, + "theoretical_loss": 5.495529112170568, + "tokens_seen": 38535168 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499147442326981, + "loss": 4.9326, + "theoretical_loss": 5.494060885772887, + "tokens_seen": 38600704 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991374122367101, + "loss": 4.875, + "theoretical_loss": 5.492595846625951, + "tokens_seen": 38666240 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991273821464393, + "loss": 4.8233, + "theoretical_loss": 5.491133982426266, + "tokens_seen": 38731776 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991173520561685, + "loss": 5.0339, + "theoretical_loss": 5.489675280938547, + "tokens_seen": 38797312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991073219658977, + "loss": 5.0537, + "theoretical_loss": 5.488219729995227, + "tokens_seen": 38862848 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499097291875627, + "loss": 4.9487, + "theoretical_loss": 5.486767317495966, + "tokens_seen": 38928384 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499087261785356, + "loss": 4.8735, + "theoretical_loss": 5.48531803140717, + "tokens_seen": 38993920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990772316950853, + "loss": 4.9125, + "theoretical_loss": 5.483871859761511, + "tokens_seen": 39059456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990672016048144, + "loss": 4.8104, + "theoretical_loss": 5.482428790657449, + "tokens_seen": 39124992 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990571715145437, + "loss": 4.8264, + "theoretical_loss": 5.480988812258763, + "tokens_seen": 39190528 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990471414242729, + "loss": 4.834, + "theoretical_loss": 5.479551912794086, + "tokens_seen": 39256064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 95351, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 5.164060115814209, + "objective/train/theoretical_loss": 5.478118080556438, + "objective/train/tokens_used": 59781600, + "theoretical_loss": 5.478118080556438, + "tokens_seen": 39321600 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499037111334002, + "loss": 5.0383, + "theoretical_loss": 5.478118080556438, + "tokens_seen": 39321600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990270812437312, + "loss": 5.0031, + "theoretical_loss": 5.476687303902768, + "tokens_seen": 39387136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990170511534604, + "loss": 4.9052, + "theoretical_loss": 5.475259571253502, + "tokens_seen": 39452672 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990070210631896, + "loss": 4.7047, + "theoretical_loss": 5.473834871092089, + "tokens_seen": 39518208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989969909729188, + "loss": 4.7929, + "theoretical_loss": 5.4724131919645576, + "tokens_seen": 39583744 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498986960882648, + "loss": 4.9548, + "theoretical_loss": 5.470994522479069, + "tokens_seen": 39649280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989769307923771, + "loss": 4.9846, + "theoretical_loss": 5.4695788513054815, + "tokens_seen": 39714816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989669007021063, + "loss": 4.8696, + "theoretical_loss": 5.468166167174912, + "tokens_seen": 39780352 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989568706118355, + "loss": 4.8161, + "theoretical_loss": 5.466756458879306, + "tokens_seen": 39845888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989468405215647, + "loss": 4.6975, + "theoretical_loss": 5.465349715271013, + "tokens_seen": 39911424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989368104312939, + "loss": 4.8749, + "theoretical_loss": 5.463945925262355, + "tokens_seen": 39976960 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498926780341023, + "loss": 4.7834, + "theoretical_loss": 5.462545077825214, + "tokens_seen": 40042496 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989167502507523, + "loss": 4.9832, + "theoretical_loss": 5.461147161990611, + "tokens_seen": 40108032 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989067201604814, + "loss": 4.8644, + "theoretical_loss": 5.459752166848292, + "tokens_seen": 40173568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988966900702107, + "loss": 4.7752, + "theoretical_loss": 5.458360081546321, + "tokens_seen": 40239104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988866599799398, + "loss": 5.0494, + "theoretical_loss": 5.456970895290674, + "tokens_seen": 40304640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988766298896691, + "loss": 4.9456, + "theoretical_loss": 5.455584597344835, + "tokens_seen": 40370176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988665997993982, + "loss": 5.0286, + "theoretical_loss": 5.454201177029395, + "tokens_seen": 40435712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988565697091274, + "loss": 4.9307, + "theoretical_loss": 5.452820623721662, + "tokens_seen": 40501248 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988465396188566, + "loss": 4.7727, + "theoretical_loss": 5.45144292685526, + "tokens_seen": 40566784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988365095285858, + "loss": 4.8423, + "theoretical_loss": 5.450068075919752, + "tokens_seen": 40632320 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498826479438315, + "loss": 4.8619, + "theoretical_loss": 5.44869606046024, + "tokens_seen": 40697856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988164493480441, + "loss": 4.79, + "theoretical_loss": 5.447326870076996, + "tokens_seen": 40763392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988064192577733, + "loss": 4.7217, + "theoretical_loss": 5.445960494425072, + "tokens_seen": 40828928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987963891675025, + "loss": 4.6799, + "theoretical_loss": 5.444596923213931, + "tokens_seen": 40894464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 97874, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.858269691467285, + "objective/train/theoretical_loss": 5.443236146207074, + "objective/train/tokens_used": 61420000, + "theoretical_loss": 5.443236146207074, + "tokens_seen": 40960000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987863590772317, + "loss": 4.8036, + "theoretical_loss": 5.443236146207074, + "tokens_seen": 40960000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987763289869609, + "loss": 4.7825, + "theoretical_loss": 5.441878153221662, + "tokens_seen": 41025536 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049876629889669, + "loss": 4.8481, + "theoretical_loss": 5.440522934128164, + "tokens_seen": 41091072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987562688064192, + "loss": 4.8502, + "theoretical_loss": 5.439170478849976, + "tokens_seen": 41156608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987462387161484, + "loss": 4.7447, + "theoretical_loss": 5.437820777363078, + "tokens_seen": 41222144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987362086258777, + "loss": 4.8936, + "theoretical_loss": 5.4364738196956655, + "tokens_seen": 41287680 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987261785356068, + "loss": 4.7766, + "theoretical_loss": 5.435129595927794, + "tokens_seen": 41353216 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987161484453361, + "loss": 4.7729, + "theoretical_loss": 5.433788096191039, + "tokens_seen": 41418752 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987061183550651, + "loss": 4.8357, + "theoretical_loss": 5.432449310668134, + "tokens_seen": 41484288 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986960882647944, + "loss": 4.9716, + "theoretical_loss": 5.4311132295926345, + "tokens_seen": 41549824 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986860581745236, + "loss": 4.8144, + "theoretical_loss": 5.42977984324857, + "tokens_seen": 41615360 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986760280842528, + "loss": 4.7498, + "theoretical_loss": 5.428449141970107, + "tokens_seen": 41680896 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498665997993982, + "loss": 4.7787, + "theoretical_loss": 5.427121116141212, + "tokens_seen": 41746432 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986559679037111, + "loss": 4.7882, + "theoretical_loss": 5.42579575619531, + "tokens_seen": 41811968 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986459378134403, + "loss": 4.7111, + "theoretical_loss": 5.424473052614967, + "tokens_seen": 41877504 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986359077231695, + "loss": 4.6959, + "theoretical_loss": 5.423152995931552, + "tokens_seen": 41943040 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986258776328987, + "loss": 4.8223, + "theoretical_loss": 5.421835576724906, + "tokens_seen": 42008576 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986158475426279, + "loss": 4.926, + "theoretical_loss": 5.420520785623031, + "tokens_seen": 42074112 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498605817452357, + "loss": 4.73, + "theoretical_loss": 5.4192086133017625, + "tokens_seen": 42139648 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985957873620862, + "loss": 4.7094, + "theoretical_loss": 5.417899050484451, + "tokens_seen": 42205184 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985857572718154, + "loss": 4.7984, + "theoretical_loss": 5.416592087941646, + "tokens_seen": 42270720 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985757271815446, + "loss": 4.7134, + "theoretical_loss": 5.415287716490787, + "tokens_seen": 42336256 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985656970912738, + "loss": 4.698, + "theoretical_loss": 5.413985926995892, + "tokens_seen": 42401792 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985556670010031, + "loss": 4.6165, + "theoretical_loss": 5.412686710367245, + "tokens_seen": 42467328 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985456369107321, + "loss": 4.6415, + "theoretical_loss": 5.411390057561097, + "tokens_seen": 42532864 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 100753, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.590413570404053, + "objective/train/theoretical_loss": 5.410095959579362, + "objective/train/tokens_used": 63058400, + "theoretical_loss": 5.410095959579362, + "tokens_seen": 42598400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985356068204614, + "loss": 4.6407, + "theoretical_loss": 5.410095959579362, + "tokens_seen": 42598400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985255767301905, + "loss": 4.7929, + "theoretical_loss": 5.408804407469308, + "tokens_seen": 42663936 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985155466399198, + "loss": 4.8958, + "theoretical_loss": 5.407515392323276, + "tokens_seen": 42729472 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498505516549649, + "loss": 4.9517, + "theoretical_loss": 5.406228905278368, + "tokens_seen": 42795008 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984954864593782, + "loss": 4.6586, + "theoretical_loss": 5.404944937516161, + "tokens_seen": 42860544 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984854563691073, + "loss": 4.7779, + "theoretical_loss": 5.403663480262418, + "tokens_seen": 42926080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984754262788365, + "loss": 4.7068, + "theoretical_loss": 5.402384524786797, + "tokens_seen": 42991616 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984653961885657, + "loss": 4.6546, + "theoretical_loss": 5.401108062402562, + "tokens_seen": 43057152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984553660982949, + "loss": 4.6977, + "theoretical_loss": 5.399834084466306, + "tokens_seen": 43122688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984453360080241, + "loss": 4.61, + "theoretical_loss": 5.398562582377666, + "tokens_seen": 43188224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984353059177532, + "loss": 4.5738, + "theoretical_loss": 5.397293547579041, + "tokens_seen": 43253760 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984252758274825, + "loss": 4.9075, + "theoretical_loss": 5.396026971555319, + "tokens_seen": 43319296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984152457372116, + "loss": 4.9727, + "theoretical_loss": 5.394762845833601, + "tokens_seen": 43384832 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004984052156469409, + "loss": 4.7829, + "theoretical_loss": 5.393501161982926, + "tokens_seen": 43450368 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049839518555667, + "loss": 4.7414, + "theoretical_loss": 5.392241911614005, + "tokens_seen": 43515904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983851554663993, + "loss": 4.781, + "theoretical_loss": 5.390985086378949, + "tokens_seen": 43581440 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983751253761284, + "loss": 4.7992, + "theoretical_loss": 5.389730677971002, + "tokens_seen": 43646976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983650952858576, + "loss": 4.6478, + "theoretical_loss": 5.388478678124285, + "tokens_seen": 43712512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983550651955868, + "loss": 4.811, + "theoretical_loss": 5.387229078613521, + "tokens_seen": 43778048 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498345035105316, + "loss": 4.7864, + "theoretical_loss": 5.385981871253785, + "tokens_seen": 43843584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983350050150452, + "loss": 4.7484, + "theoretical_loss": 5.384737047900243, + "tokens_seen": 43909120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983249749247743, + "loss": 4.7651, + "theoretical_loss": 5.3834946004478965, + "tokens_seen": 43974656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983149448345035, + "loss": 4.8328, + "theoretical_loss": 5.382254520831328, + "tokens_seen": 44040192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983049147442327, + "loss": 4.879, + "theoretical_loss": 5.381016801024449, + "tokens_seen": 44105728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982948846539619, + "loss": 4.6612, + "theoretical_loss": 5.379781433040252, + "tokens_seen": 44171264 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 102279, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.958780288696289, + "objective/train/theoretical_loss": 5.378548408930558, + "objective/train/tokens_used": 64696800, + "theoretical_loss": 5.378548408930558, + "tokens_seen": 44236800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982848545636911, + "loss": 4.8791, + "theoretical_loss": 5.378548408930558, + "tokens_seen": 44236800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982748244734202, + "loss": 4.8515, + "theoretical_loss": 5.377317720785777, + "tokens_seen": 44302336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982647943831494, + "loss": 4.7675, + "theoretical_loss": 5.37608936073466, + "tokens_seen": 44367872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982547642928786, + "loss": 4.8791, + "theoretical_loss": 5.374863320944057, + "tokens_seen": 44433408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982447342026079, + "loss": 4.7669, + "theoretical_loss": 5.373639593618675, + "tokens_seen": 44498944 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498234704112337, + "loss": 4.7242, + "theoretical_loss": 5.372418171000847, + "tokens_seen": 44564480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982246740220663, + "loss": 4.8156, + "theoretical_loss": 5.371199045370283, + "tokens_seen": 44630016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982146439317953, + "loss": 4.8198, + "theoretical_loss": 5.369982209043851, + "tokens_seen": 44695552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982046138415246, + "loss": 4.7153, + "theoretical_loss": 5.368767654375327, + "tokens_seen": 44761088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981945837512538, + "loss": 4.767, + "theoretical_loss": 5.367555373755179, + "tokens_seen": 44826624 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498184553660983, + "loss": 4.8224, + "theoretical_loss": 5.366345359610327, + "tokens_seen": 44892160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981745235707122, + "loss": 4.645, + "theoretical_loss": 5.365137604403923, + "tokens_seen": 44957696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981644934804413, + "loss": 4.7519, + "theoretical_loss": 5.363932100635117, + "tokens_seen": 45023232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981544633901705, + "loss": 4.6968, + "theoretical_loss": 5.362728840838843, + "tokens_seen": 45088768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981444332998997, + "loss": 4.854, + "theoretical_loss": 5.361527817585586, + "tokens_seen": 45154304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981344032096289, + "loss": 4.7136, + "theoretical_loss": 5.360329023481169, + "tokens_seen": 45219840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981243731193581, + "loss": 4.8594, + "theoretical_loss": 5.359132451166534, + "tokens_seen": 45285376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981143430290873, + "loss": 4.6249, + "theoretical_loss": 5.357938093317518, + "tokens_seen": 45350912 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981043129388164, + "loss": 4.7844, + "theoretical_loss": 5.356745942644645, + "tokens_seen": 45416448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980942828485456, + "loss": 4.6213, + "theoretical_loss": 5.355555991892905, + "tokens_seen": 45481984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980842527582748, + "loss": 4.7597, + "theoretical_loss": 5.35436823384155, + "tokens_seen": 45547520 + }, + { + "epoch": 0.01, + "learning_rate": 0.000498074222668004, + "loss": 4.6933, + "theoretical_loss": 5.353182661303873, + "tokens_seen": 45613056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980641925777333, + "loss": 4.5539, + "theoretical_loss": 5.35199926712701, + "tokens_seen": 45678592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980541624874623, + "loss": 4.7203, + "theoretical_loss": 5.350818044191721, + "tokens_seen": 45744128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980441323971916, + "loss": 4.9096, + "theoretical_loss": 5.349638985412193, + "tokens_seen": 45809664 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 105336, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.5851826667785645, + "objective/train/theoretical_loss": 5.348462083735834, + "objective/train/tokens_used": 66335200, + "theoretical_loss": 5.348462083735834, + "tokens_seen": 45875200 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980341023069207, + "loss": 4.6791, + "theoretical_loss": 5.348462083735834, + "tokens_seen": 45875200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00049802407221665, + "loss": 4.6513, + "theoretical_loss": 5.347287332143064, + "tokens_seen": 45940736 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980140421263792, + "loss": 4.7431, + "theoretical_loss": 5.346114723647119, + "tokens_seen": 46006272 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980040120361084, + "loss": 4.7977, + "theoretical_loss": 5.344944251293852, + "tokens_seen": 46071808 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979939819458375, + "loss": 4.6007, + "theoretical_loss": 5.343775908161532, + "tokens_seen": 46137344 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979839518555667, + "loss": 4.8353, + "theoretical_loss": 5.342609687360644, + "tokens_seen": 46202880 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979739217652959, + "loss": 4.7883, + "theoretical_loss": 5.341445582033705, + "tokens_seen": 46268416 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979638916750251, + "loss": 4.7979, + "theoretical_loss": 5.3402835853550545, + "tokens_seen": 46333952 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979538615847543, + "loss": 4.5668, + "theoretical_loss": 5.339123690530673, + "tokens_seen": 46399488 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979438314944834, + "loss": 4.8504, + "theoretical_loss": 5.337965890797989, + "tokens_seen": 46465024 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979338014042126, + "loss": 4.7123, + "theoretical_loss": 5.336810179425685, + "tokens_seen": 46530560 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979237713139418, + "loss": 4.7681, + "theoretical_loss": 5.335656549713516, + "tokens_seen": 46596096 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497913741223671, + "loss": 4.5716, + "theoretical_loss": 5.334504994992115, + "tokens_seen": 46661632 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979037111334002, + "loss": 4.6395, + "theoretical_loss": 5.333355508622814, + "tokens_seen": 46727168 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978936810431293, + "loss": 4.5571, + "theoretical_loss": 5.332208083997459, + "tokens_seen": 46792704 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978836509528586, + "loss": 4.5376, + "theoretical_loss": 5.33106271453822, + "tokens_seen": 46858240 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978736208625877, + "loss": 4.7635, + "theoretical_loss": 5.329919393697422, + "tokens_seen": 46923776 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497863590772317, + "loss": 4.6811, + "theoretical_loss": 5.328778114957351, + "tokens_seen": 46989312 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978535606820461, + "loss": 4.6086, + "theoretical_loss": 5.327638871830089, + "tokens_seen": 47054848 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978435305917754, + "loss": 4.7103, + "theoretical_loss": 5.326501657857326, + "tokens_seen": 47120384 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978335005015045, + "loss": 4.5543, + "theoretical_loss": 5.32536646661019, + "tokens_seen": 47185920 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978234704112337, + "loss": 4.6051, + "theoretical_loss": 5.324233291689069, + "tokens_seen": 47251456 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978134403209629, + "loss": 4.8205, + "theoretical_loss": 5.323102126723439, + "tokens_seen": 47316992 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978034102306921, + "loss": 4.7387, + "theoretical_loss": 5.321972965371691, + "tokens_seen": 47382528 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977933801404213, + "loss": 4.6653, + "theoretical_loss": 5.320845801320959, + "tokens_seen": 47448064 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 108007, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.374704837799072, + "objective/train/theoretical_loss": 5.319720628286955, + "objective/train/tokens_used": 67973600, + "theoretical_loss": 5.319720628286955, + "tokens_seen": 47513600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977833500501504, + "loss": 4.6795, + "theoretical_loss": 5.319720628286955, + "tokens_seen": 47513600 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977733199598796, + "loss": 4.5909, + "theoretical_loss": 5.318597440013795, + "tokens_seen": 47579136 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977632898696088, + "loss": 4.3845, + "theoretical_loss": 5.317476230273831, + "tokens_seen": 47644672 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497753259779338, + "loss": 4.5953, + "theoretical_loss": 5.316356992867491, + "tokens_seen": 47710208 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977432296890672, + "loss": 4.5629, + "theoretical_loss": 5.31523972162311, + "tokens_seen": 47775744 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977331995987965, + "loss": 4.6185, + "theoretical_loss": 5.314124410396767, + "tokens_seen": 47841280 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977231695085255, + "loss": 4.646, + "theoretical_loss": 5.31301105307212, + "tokens_seen": 47906816 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977131394182548, + "loss": 4.7214, + "theoretical_loss": 5.311899643560251, + "tokens_seen": 47972352 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497703109327984, + "loss": 4.6251, + "theoretical_loss": 5.310790175799497, + "tokens_seen": 48037888 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976930792377132, + "loss": 4.698, + "theoretical_loss": 5.3096826437553, + "tokens_seen": 48103424 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976830491474424, + "loss": 4.6241, + "theoretical_loss": 5.308577041420046, + "tokens_seen": 48168960 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976730190571715, + "loss": 4.6452, + "theoretical_loss": 5.3074733628129005, + "tokens_seen": 48234496 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976629889669007, + "loss": 4.6468, + "theoretical_loss": 5.3063716019796665, + "tokens_seen": 48300032 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976529588766299, + "loss": 4.6056, + "theoretical_loss": 5.305271752992619, + "tokens_seen": 48365568 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976429287863591, + "loss": 4.7991, + "theoretical_loss": 5.304173809950358, + "tokens_seen": 48431104 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976328986960883, + "loss": 4.6448, + "theoretical_loss": 5.303077766977653, + "tokens_seen": 48496640 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976228686058175, + "loss": 4.6151, + "theoretical_loss": 5.3019836182252895, + "tokens_seen": 48562176 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976128385155466, + "loss": 4.6478, + "theoretical_loss": 5.300891357869929, + "tokens_seen": 48627712 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976028084252758, + "loss": 4.64, + "theoretical_loss": 5.299800980113945, + "tokens_seen": 48693248 + }, + { + "epoch": 0.01, + "learning_rate": 0.000497592778335005, + "loss": 4.4698, + "theoretical_loss": 5.298712479185288, + "tokens_seen": 48758784 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975827482447342, + "loss": 4.6353, + "theoretical_loss": 5.297625849337331, + "tokens_seen": 48824320 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975727181544635, + "loss": 4.619, + "theoretical_loss": 5.296541084848727, + "tokens_seen": 48889856 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975626880641925, + "loss": 4.6679, + "theoretical_loss": 5.295458180023262, + "tokens_seen": 48955392 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975526579739218, + "loss": 4.6945, + "theoretical_loss": 5.294377129189715, + "tokens_seen": 49020928 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975426278836509, + "loss": 4.7258, + "theoretical_loss": 5.293297926701706, + "tokens_seen": 49086464 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 110370, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.364441394805908, + "objective/train/theoretical_loss": 5.292220566937567, + "objective/train/tokens_used": 69612000, + "theoretical_loss": 5.292220566937567, + "tokens_seen": 49152000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975325977933802, + "loss": 4.4168, + "theoretical_loss": 5.292220566937567, + "tokens_seen": 49152000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975225677031094, + "loss": 4.6948, + "theoretical_loss": 5.29114504430019, + "tokens_seen": 49217536 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975125376128386, + "loss": 4.5873, + "theoretical_loss": 5.290071353216895, + "tokens_seen": 49283072 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004975025075225677, + "loss": 4.6435, + "theoretical_loss": 5.288999488139284, + "tokens_seen": 49348608 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004974924774322969, + "loss": 4.5103, + "theoretical_loss": 5.28792944354311, + "tokens_seen": 49414144 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004974824473420261, + "loss": 4.6939, + "theoretical_loss": 5.286861213928137, + "tokens_seen": 49479680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974724172517553, + "loss": 4.5337, + "theoretical_loss": 5.285794793817999, + "tokens_seen": 49545216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974623871614845, + "loss": 4.5368, + "theoretical_loss": 5.284730177760077, + "tokens_seen": 49610752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974523570712136, + "loss": 4.4568, + "theoretical_loss": 5.283667360325351, + "tokens_seen": 49676288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974423269809428, + "loss": 4.6428, + "theoretical_loss": 5.2826063361082785, + "tokens_seen": 49741824 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497432296890672, + "loss": 4.4963, + "theoretical_loss": 5.281547099726654, + "tokens_seen": 49807360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974222668004012, + "loss": 4.6042, + "theoretical_loss": 5.280489645821483, + "tokens_seen": 49872896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974122367101304, + "loss": 4.6027, + "theoretical_loss": 5.279433969056848, + "tokens_seen": 49938432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974022066198595, + "loss": 4.5878, + "theoretical_loss": 5.278380064119782, + "tokens_seen": 50003968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973921765295888, + "loss": 4.5084, + "theoretical_loss": 5.277327925720137, + "tokens_seen": 50069504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973821464393179, + "loss": 4.5762, + "theoretical_loss": 5.276277548590457, + "tokens_seen": 50135040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973721163490472, + "loss": 4.5497, + "theoretical_loss": 5.275228927485855, + "tokens_seen": 50200576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973620862587763, + "loss": 4.5808, + "theoretical_loss": 5.2741820571838804, + "tokens_seen": 50266112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973520561685056, + "loss": 4.5335, + "theoretical_loss": 5.273136932484399, + "tokens_seen": 50331648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973420260782347, + "loss": 4.4173, + "theoretical_loss": 5.272093548209467, + "tokens_seen": 50397184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973319959879639, + "loss": 4.6508, + "theoretical_loss": 5.271051899203207, + "tokens_seen": 50462720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973219658976931, + "loss": 4.3289, + "theoretical_loss": 5.270011980331685, + "tokens_seen": 50528256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973119358074223, + "loss": 4.7097, + "theoretical_loss": 5.268973786482794, + "tokens_seen": 50593792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973019057171515, + "loss": 4.5142, + "theoretical_loss": 5.267937312566123, + "tokens_seen": 50659328 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972918756268806, + "loss": 4.6685, + "theoretical_loss": 5.266902553512847, + "tokens_seen": 50724864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 113141, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.633150100708008, + "objective/train/theoretical_loss": 5.265869504275602, + "objective/train/tokens_used": 71250400, + "theoretical_loss": 5.265869504275602, + "tokens_seen": 50790400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972818455366098, + "loss": 4.6543, + "theoretical_loss": 5.265869504275602, + "tokens_seen": 50790400 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497271815446339, + "loss": 4.4724, + "theoretical_loss": 5.264838159828369, + "tokens_seen": 50855936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972617853560682, + "loss": 4.6157, + "theoretical_loss": 5.263808515166355, + "tokens_seen": 50921472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972517552657974, + "loss": 4.4663, + "theoretical_loss": 5.262780565305875, + "tokens_seen": 50987008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972417251755266, + "loss": 4.5934, + "theoretical_loss": 5.261754305284241, + "tokens_seen": 51052544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972316950852557, + "loss": 4.5196, + "theoretical_loss": 5.260729730159641, + "tokens_seen": 51118080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972216649949849, + "loss": 4.5725, + "theoretical_loss": 5.259706835011027, + "tokens_seen": 51183616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972116349047142, + "loss": 4.5552, + "theoretical_loss": 5.2586856149380035, + "tokens_seen": 51249152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972016048144433, + "loss": 4.6341, + "theoretical_loss": 5.257666065060709, + "tokens_seen": 51314688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971915747241726, + "loss": 4.5182, + "theoretical_loss": 5.256648180519708, + "tokens_seen": 51380224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971815446339017, + "loss": 4.3385, + "theoretical_loss": 5.255631956475881, + "tokens_seen": 51445760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971715145436309, + "loss": 4.5573, + "theoretical_loss": 5.25461738811031, + "tokens_seen": 51511296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971614844533601, + "loss": 4.4789, + "theoretical_loss": 5.25360447062417, + "tokens_seen": 51576832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971514543630893, + "loss": 4.4235, + "theoretical_loss": 5.252593199238619, + "tokens_seen": 51642368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971414242728185, + "loss": 4.5234, + "theoretical_loss": 5.2515835691946915, + "tokens_seen": 51707904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971313941825477, + "loss": 4.4538, + "theoretical_loss": 5.2505755757531904, + "tokens_seen": 51773440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971213640922768, + "loss": 4.5667, + "theoretical_loss": 5.24956921419458, + "tokens_seen": 51838976 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497111334002006, + "loss": 4.6083, + "theoretical_loss": 5.248564479818876, + "tokens_seen": 51904512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971013039117352, + "loss": 4.3852, + "theoretical_loss": 5.247561367945544, + "tokens_seen": 51970048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970912738214644, + "loss": 4.5429, + "theoretical_loss": 5.246559873913396, + "tokens_seen": 52035584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970812437311936, + "loss": 4.4612, + "theoretical_loss": 5.245559993080484, + "tokens_seen": 52101120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970712136409227, + "loss": 4.6224, + "theoretical_loss": 5.24456172082399, + "tokens_seen": 52166656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970611835506519, + "loss": 4.4881, + "theoretical_loss": 5.243565052540136, + "tokens_seen": 52232192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970511534603811, + "loss": 4.3186, + "theoretical_loss": 5.242569983644074, + "tokens_seen": 52297728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970411233701103, + "loss": 4.4209, + "theoretical_loss": 5.241576509569784, + "tokens_seen": 52363264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 115841, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.285571098327637, + "objective/train/theoretical_loss": 5.240584625769978, + "objective/train/tokens_used": 72888800, + "theoretical_loss": 5.240584625769978, + "tokens_seen": 52428800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970310932798396, + "loss": 4.5032, + "theoretical_loss": 5.240584625769978, + "tokens_seen": 52428800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970210631895686, + "loss": 4.3593, + "theoretical_loss": 5.239594327715992, + "tokens_seen": 52494336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970110330992979, + "loss": 4.5219, + "theoretical_loss": 5.238605610897698, + "tokens_seen": 52559872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970010030090271, + "loss": 4.3605, + "theoretical_loss": 5.237618470823394, + "tokens_seen": 52625408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969909729187563, + "loss": 4.48, + "theoretical_loss": 5.2366329030197125, + "tokens_seen": 52690944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969809428284855, + "loss": 4.4636, + "theoretical_loss": 5.235648903031521, + "tokens_seen": 52756480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969709127382147, + "loss": 4.482, + "theoretical_loss": 5.2346664664218245, + "tokens_seen": 52822016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969608826479438, + "loss": 4.47, + "theoretical_loss": 5.233685588771669, + "tokens_seen": 52887552 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496950852557673, + "loss": 4.348, + "theoretical_loss": 5.232706265680049, + "tokens_seen": 52953088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969408224674022, + "loss": 4.537, + "theoretical_loss": 5.231728492763811, + "tokens_seen": 53018624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969307923771314, + "loss": 4.4114, + "theoretical_loss": 5.230752265657554, + "tokens_seen": 53084160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969207622868606, + "loss": 4.5096, + "theoretical_loss": 5.229777580013545, + "tokens_seen": 53149696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004969107321965897, + "loss": 4.0917, + "theoretical_loss": 5.228804431501619, + "tokens_seen": 53215232 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496900702106319, + "loss": 4.4378, + "theoretical_loss": 5.227832815809087, + "tokens_seen": 53280768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968906720160481, + "loss": 4.5288, + "theoretical_loss": 5.226862728640651, + "tokens_seen": 53346304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968806419257774, + "loss": 4.4157, + "theoretical_loss": 5.2258941657183, + "tokens_seen": 53411840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968706118355065, + "loss": 4.5261, + "theoretical_loss": 5.2249271227812315, + "tokens_seen": 53477376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968605817452358, + "loss": 4.3307, + "theoretical_loss": 5.223961595585755, + "tokens_seen": 53542912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968505516549649, + "loss": 4.4123, + "theoretical_loss": 5.222997579905204, + "tokens_seen": 53608448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968405215646941, + "loss": 4.2908, + "theoretical_loss": 5.222035071529845, + "tokens_seen": 53673984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968304914744233, + "loss": 4.4255, + "theoretical_loss": 5.2210740662667945, + "tokens_seen": 53739520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968204613841525, + "loss": 4.5236, + "theoretical_loss": 5.220114559939923, + "tokens_seen": 53805056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968104312938817, + "loss": 4.643, + "theoretical_loss": 5.219156548389775, + "tokens_seen": 53870592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968004012036108, + "loss": 4.4154, + "theoretical_loss": 5.218200027473481, + "tokens_seen": 53936128 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049679037111334, + "loss": 4.3132, + "theoretical_loss": 5.217244993064664, + "tokens_seen": 54001664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 118581, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.1985626220703125, + "objective/train/theoretical_loss": 5.216291441053366, + "objective/train/tokens_used": 74527200, + "theoretical_loss": 5.216291441053366, + "tokens_seen": 54067200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967803410230692, + "loss": 4.4553, + "theoretical_loss": 5.216291441053366, + "tokens_seen": 54067200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967703109327984, + "loss": 4.4407, + "theoretical_loss": 5.215339367345955, + "tokens_seen": 54132736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967602808425276, + "loss": 4.5843, + "theoretical_loss": 5.214388767865036, + "tokens_seen": 54198272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967502507522568, + "loss": 4.2886, + "theoretical_loss": 5.2134396385493815, + "tokens_seen": 54263808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967402206619859, + "loss": 4.4615, + "theoretical_loss": 5.212491975353835, + "tokens_seen": 54329344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967301905717151, + "loss": 4.3637, + "theoretical_loss": 5.211545774249233, + "tokens_seen": 54394880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967201604814444, + "loss": 4.5198, + "theoretical_loss": 5.210601031222324, + "tokens_seen": 54460416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967101303911735, + "loss": 4.1755, + "theoretical_loss": 5.209657742275683, + "tokens_seen": 54525952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004967001003009028, + "loss": 4.1122, + "theoretical_loss": 5.208715903427631, + "tokens_seen": 54591488 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496690070210632, + "loss": 4.5612, + "theoretical_loss": 5.207775510712159, + "tokens_seen": 54657024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966800401203611, + "loss": 4.2206, + "theoretical_loss": 5.2068365601788384, + "tokens_seen": 54722560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966700100300903, + "loss": 4.3144, + "theoretical_loss": 5.205899047892753, + "tokens_seen": 54788096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966599799398195, + "loss": 4.3646, + "theoretical_loss": 5.2049629699344075, + "tokens_seen": 54853632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966499498495487, + "loss": 4.4599, + "theoretical_loss": 5.204028322399658, + "tokens_seen": 54919168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966399197592779, + "loss": 4.3735, + "theoretical_loss": 5.203095101399628, + "tokens_seen": 54984704 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496629889669007, + "loss": 4.3323, + "theoretical_loss": 5.202163303060633, + "tokens_seen": 55050240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966198595787362, + "loss": 4.3846, + "theoretical_loss": 5.201232923524104, + "tokens_seen": 55115776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966098294884654, + "loss": 4.386, + "theoretical_loss": 5.20030395894651, + "tokens_seen": 55181312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965997993981946, + "loss": 4.3547, + "theoretical_loss": 5.199376405499277, + "tokens_seen": 55246848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965897693079238, + "loss": 4.4294, + "theoretical_loss": 5.198450259368721, + "tokens_seen": 55312384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965797392176529, + "loss": 4.227, + "theoretical_loss": 5.197525516755965, + "tokens_seen": 55377920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965697091273821, + "loss": 4.2706, + "theoretical_loss": 5.196602173876867, + "tokens_seen": 55443456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965596790371113, + "loss": 4.4081, + "theoretical_loss": 5.195680226961947, + "tokens_seen": 55508992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965496489468405, + "loss": 4.2608, + "theoretical_loss": 5.194759672256309, + "tokens_seen": 55574528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965396188565698, + "loss": 4.2693, + "theoretical_loss": 5.19384050601957, + "tokens_seen": 55640064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 121212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.23940372467041, + "objective/train/theoretical_loss": 5.192922724525789, + "objective/train/tokens_used": 76165600, + "theoretical_loss": 5.192922724525789, + "tokens_seen": 55705600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965295887662988, + "loss": 4.3324, + "theoretical_loss": 5.192922724525789, + "tokens_seen": 55705600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965195586760281, + "loss": 4.3729, + "theoretical_loss": 5.19200632406339, + "tokens_seen": 55771136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965095285857573, + "loss": 4.4426, + "theoretical_loss": 5.19109130093509, + "tokens_seen": 55836672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964994984954865, + "loss": 4.2226, + "theoretical_loss": 5.190177651457833, + "tokens_seen": 55902208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964894684052157, + "loss": 4.2945, + "theoretical_loss": 5.189265371962712, + "tokens_seen": 55967744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964794383149449, + "loss": 4.3103, + "theoretical_loss": 5.188354458794902, + "tokens_seen": 56033280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496469408224674, + "loss": 4.2649, + "theoretical_loss": 5.187444908313586, + "tokens_seen": 56098816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964593781344032, + "loss": 4.4507, + "theoretical_loss": 5.186536716891892, + "tokens_seen": 56164352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964493480441324, + "loss": 4.3774, + "theoretical_loss": 5.185629880916814, + "tokens_seen": 56229888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964393179538616, + "loss": 4.3573, + "theoretical_loss": 5.18472439678915, + "tokens_seen": 56295424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964292878635908, + "loss": 4.136, + "theoretical_loss": 5.18382026092343, + "tokens_seen": 56360960 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049641925777332, + "loss": 4.3191, + "theoretical_loss": 5.182917469747851, + "tokens_seen": 56426496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964092276830491, + "loss": 4.4761, + "theoretical_loss": 5.182016019704204, + "tokens_seen": 56492032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963991975927783, + "loss": 4.4716, + "theoretical_loss": 5.1811159072478095, + "tokens_seen": 56557568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963891675025075, + "loss": 4.1831, + "theoretical_loss": 5.180217128847451, + "tokens_seen": 56623104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963791374122367, + "loss": 4.4563, + "theoretical_loss": 5.17931968098531, + "tokens_seen": 56688640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963691073219659, + "loss": 4.3653, + "theoretical_loss": 5.178423560156894, + "tokens_seen": 56754176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963590772316951, + "loss": 4.3067, + "theoretical_loss": 5.177528762870973, + "tokens_seen": 56819712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963490471414242, + "loss": 4.2897, + "theoretical_loss": 5.176635285649521, + "tokens_seen": 56885248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963390170511535, + "loss": 4.2636, + "theoretical_loss": 5.175743125027638, + "tokens_seen": 56950784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963289869608827, + "loss": 4.5011, + "theoretical_loss": 5.174852277553498, + "tokens_seen": 57016320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963189568706119, + "loss": 4.2683, + "theoretical_loss": 5.173962739788276, + "tokens_seen": 57081856 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496308926780341, + "loss": 4.0628, + "theoretical_loss": 5.17307450830609, + "tokens_seen": 57147392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962988966900702, + "loss": 4.2355, + "theoretical_loss": 5.172187579693933, + "tokens_seen": 57212928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962888665997994, + "loss": 4.4906, + "theoretical_loss": 5.1713019505516105, + "tokens_seen": 57278464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 122670, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.318225860595703, + "objective/train/theoretical_loss": 5.170417617491682, + "objective/train/tokens_used": 77804000, + "theoretical_loss": 5.170417617491682, + "tokens_seen": 57344000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962788365095286, + "loss": 4.1734, + "theoretical_loss": 5.170417617491682, + "tokens_seen": 57344000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962688064192578, + "loss": 4.3321, + "theoretical_loss": 5.169534577139395, + "tokens_seen": 57409536 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496258776328987, + "loss": 4.1039, + "theoretical_loss": 5.168652826132623, + "tokens_seen": 57475072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962487462387161, + "loss": 4.1895, + "theoretical_loss": 5.167772361121805, + "tokens_seen": 57540608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962387161484453, + "loss": 4.2033, + "theoretical_loss": 5.166893178769884, + "tokens_seen": 57606144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962286860581746, + "loss": 4.0604, + "theoretical_loss": 5.1660152757522475, + "tokens_seen": 57671680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962186559679037, + "loss": 4.1823, + "theoretical_loss": 5.165138648756665, + "tokens_seen": 57737216 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496208625877633, + "loss": 4.2039, + "theoretical_loss": 5.164263294483226, + "tokens_seen": 57802752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961985957873621, + "loss": 4.2198, + "theoretical_loss": 5.163389209644287, + "tokens_seen": 57868288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961885656970913, + "loss": 4.3944, + "theoretical_loss": 5.162516390964408, + "tokens_seen": 57933824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961785356068205, + "loss": 4.2938, + "theoretical_loss": 5.1616448351802875, + "tokens_seen": 57999360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961685055165497, + "loss": 4.2794, + "theoretical_loss": 5.160774539040716, + "tokens_seen": 58064896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961584754262789, + "loss": 4.4463, + "theoretical_loss": 5.159905499306511, + "tokens_seen": 58130432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961484453360081, + "loss": 4.1833, + "theoretical_loss": 5.159037712750455, + "tokens_seen": 58195968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961384152457372, + "loss": 4.1665, + "theoretical_loss": 5.158171176157245, + "tokens_seen": 58261504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961283851554664, + "loss": 4.2758, + "theoretical_loss": 5.157305886323435, + "tokens_seen": 58327040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961183550651956, + "loss": 4.2675, + "theoretical_loss": 5.156441840057371, + "tokens_seen": 58392576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961083249749248, + "loss": 4.2078, + "theoretical_loss": 5.155579034179144, + "tokens_seen": 58458112 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496098294884654, + "loss": 4.2221, + "theoretical_loss": 5.15471746552053, + "tokens_seen": 58523648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960882647943831, + "loss": 4.1744, + "theoretical_loss": 5.153857130924929, + "tokens_seen": 58589184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960782347041123, + "loss": 4.3101, + "theoretical_loss": 5.1529980272473175, + "tokens_seen": 58654720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960682046138415, + "loss": 4.2873, + "theoretical_loss": 5.152140151354191, + "tokens_seen": 58720256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960581745235707, + "loss": 4.1584, + "theoretical_loss": 5.151283500123505, + "tokens_seen": 58785792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960481444333, + "loss": 4.2865, + "theoretical_loss": 5.150428070444621, + "tokens_seen": 58851328 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496038114343029, + "loss": 4.4394, + "theoretical_loss": 5.149573859218261, + "tokens_seen": 58916864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 125600, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.007076740264893, + "objective/train/theoretical_loss": 5.1487208633564405, + "objective/train/tokens_used": 79442400, + "theoretical_loss": 5.1487208633564405, + "tokens_seen": 58982400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960280842527583, + "loss": 4.233, + "theoretical_loss": 5.1487208633564405, + "tokens_seen": 58982400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960180541624875, + "loss": 4.1681, + "theoretical_loss": 5.147869079782423, + "tokens_seen": 59047936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004960080240722167, + "loss": 4.1255, + "theoretical_loss": 5.147018505430666, + "tokens_seen": 59113472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959979939819459, + "loss": 4.201, + "theoretical_loss": 5.146169137246765, + "tokens_seen": 59179008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959879638916751, + "loss": 4.2392, + "theoretical_loss": 5.145320972187402, + "tokens_seen": 59244544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959779338014042, + "loss": 3.9711, + "theoretical_loss": 5.144474007220293, + "tokens_seen": 59310080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959679037111334, + "loss": 4.4679, + "theoretical_loss": 5.143628239324139, + "tokens_seen": 59375616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959578736208626, + "loss": 4.3305, + "theoretical_loss": 5.142783665488567, + "tokens_seen": 59441152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959478435305918, + "loss": 4.2006, + "theoretical_loss": 5.1419402827140885, + "tokens_seen": 59506688 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495937813440321, + "loss": 4.1907, + "theoretical_loss": 5.141098088012036, + "tokens_seen": 59572224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959277833500501, + "loss": 4.2224, + "theoretical_loss": 5.140257078404524, + "tokens_seen": 59637760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959177532597793, + "loss": 4.367, + "theoretical_loss": 5.13941725092439, + "tokens_seen": 59703296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959077231695085, + "loss": 4.1563, + "theoretical_loss": 5.138578602615146, + "tokens_seen": 59768832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958976930792377, + "loss": 4.1792, + "theoretical_loss": 5.137741130530934, + "tokens_seen": 59834368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958876629889669, + "loss": 4.0723, + "theoretical_loss": 5.1369048317364685, + "tokens_seen": 59899904 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495877632898696, + "loss": 4.183, + "theoretical_loss": 5.13606970330699, + "tokens_seen": 59965440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958676028084253, + "loss": 4.4316, + "theoretical_loss": 5.135235742328217, + "tokens_seen": 60030976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958575727181544, + "loss": 4.1327, + "theoretical_loss": 5.134402945896297, + "tokens_seen": 60096512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958475426278837, + "loss": 4.2493, + "theoretical_loss": 5.133571311117755, + "tokens_seen": 60162048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958375125376129, + "loss": 4.3176, + "theoretical_loss": 5.132740835109448, + "tokens_seen": 60227584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958274824473421, + "loss": 4.2086, + "theoretical_loss": 5.131911514998518, + "tokens_seen": 60293120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958174523570712, + "loss": 4.285, + "theoretical_loss": 5.131083347922338, + "tokens_seen": 60358656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958074222668004, + "loss": 3.9992, + "theoretical_loss": 5.130256331028474, + "tokens_seen": 60424192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957973921765296, + "loss": 4.224, + "theoretical_loss": 5.129430461474628, + "tokens_seen": 60489728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957873620862588, + "loss": 4.2686, + "theoretical_loss": 5.128605736428597, + "tokens_seen": 60555264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 128428, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.303081512451172, + "objective/train/theoretical_loss": 5.127782153068225, + "objective/train/tokens_used": 81080800, + "theoretical_loss": 5.127782153068225, + "tokens_seen": 60620800 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495777331995988, + "loss": 4.2848, + "theoretical_loss": 5.127782153068225, + "tokens_seen": 60620800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957673019057172, + "loss": 4.0649, + "theoretical_loss": 5.126959708581356, + "tokens_seen": 60686336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957572718154463, + "loss": 4.2269, + "theoretical_loss": 5.1261384001657895, + "tokens_seen": 60751872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957472417251755, + "loss": 4.1762, + "theoretical_loss": 5.125318225029231, + "tokens_seen": 60817408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957372116349047, + "loss": 4.1618, + "theoretical_loss": 5.124499180389249, + "tokens_seen": 60882944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957271815446339, + "loss": 4.1496, + "theoretical_loss": 5.12368126347323, + "tokens_seen": 60948480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957171514543631, + "loss": 4.2578, + "theoretical_loss": 5.122864471518334, + "tokens_seen": 61014016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957071213640923, + "loss": 4.1718, + "theoretical_loss": 5.122048801771443, + "tokens_seen": 61079552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956970912738214, + "loss": 3.8569, + "theoretical_loss": 5.121234251489128, + "tokens_seen": 61145088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956870611835507, + "loss": 4.1752, + "theoretical_loss": 5.120420817937591, + "tokens_seen": 61210624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956770310932798, + "loss": 4.0605, + "theoretical_loss": 5.119608498392633, + "tokens_seen": 61276160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956670010030091, + "loss": 4.0987, + "theoretical_loss": 5.118797290139605, + "tokens_seen": 61341696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956569709127383, + "loss": 4.1523, + "theoretical_loss": 5.117987190473361, + "tokens_seen": 61407232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956469408224674, + "loss": 4.1486, + "theoretical_loss": 5.1171781966982195, + "tokens_seen": 61472768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956369107321966, + "loss": 4.0387, + "theoretical_loss": 5.116370306127921, + "tokens_seen": 61538304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956268806419258, + "loss": 4.0154, + "theoretical_loss": 5.11556351608558, + "tokens_seen": 61603840 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495616850551655, + "loss": 4.1014, + "theoretical_loss": 5.114757823903647, + "tokens_seen": 61669376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956068204613842, + "loss": 4.0218, + "theoretical_loss": 5.113953226923864, + "tokens_seen": 61734912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955967903711133, + "loss": 4.2676, + "theoretical_loss": 5.113149722497221, + "tokens_seen": 61800448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955867602808425, + "loss": 4.0342, + "theoretical_loss": 5.112347307983919, + "tokens_seen": 61865984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955767301905717, + "loss": 4.0808, + "theoretical_loss": 5.111545980753322, + "tokens_seen": 61931520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955667001003009, + "loss": 4.3096, + "theoretical_loss": 5.110745738183919, + "tokens_seen": 61997056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955566700100301, + "loss": 4.0913, + "theoretical_loss": 5.109946577663284, + "tokens_seen": 62062592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955466399197592, + "loss": 4.1592, + "theoretical_loss": 5.109148496588032, + "tokens_seen": 62128128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955366098294884, + "loss": 4.0169, + "theoretical_loss": 5.108351492363779, + "tokens_seen": 62193664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 131355, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.096719741821289, + "objective/train/theoretical_loss": 5.107555562405102, + "objective/train/tokens_used": 82719200, + "theoretical_loss": 5.107555562405102, + "tokens_seen": 62259200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955265797392177, + "loss": 4.0798, + "theoretical_loss": 5.107555562405102, + "tokens_seen": 62259200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955165496489468, + "loss": 4.074, + "theoretical_loss": 5.106760704135499, + "tokens_seen": 62324736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955065195586761, + "loss": 4.1596, + "theoretical_loss": 5.105966914987349, + "tokens_seen": 62390272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954964894684052, + "loss": 4.0869, + "theoretical_loss": 5.1051741924018685, + "tokens_seen": 62455808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954864593781344, + "loss": 4.0943, + "theoretical_loss": 5.10438253382908, + "tokens_seen": 62521344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954764292878636, + "loss": 4.2557, + "theoretical_loss": 5.103591936727762, + "tokens_seen": 62586880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954663991975928, + "loss": 4.1231, + "theoretical_loss": 5.102802398565418, + "tokens_seen": 62652416 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495456369107322, + "loss": 4.0894, + "theoretical_loss": 5.102013916818235, + "tokens_seen": 62717952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954463390170512, + "loss": 4.1925, + "theoretical_loss": 5.101226488971042, + "tokens_seen": 62783488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954363089267803, + "loss": 4.2879, + "theoretical_loss": 5.100440112517276, + "tokens_seen": 62849024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954262788365095, + "loss": 4.1193, + "theoretical_loss": 5.09965478495894, + "tokens_seen": 62914560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954162487462387, + "loss": 4.1428, + "theoretical_loss": 5.098870503806567, + "tokens_seen": 62980096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954062186559679, + "loss": 4.0939, + "theoretical_loss": 5.09808726657918, + "tokens_seen": 63045632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953961885656971, + "loss": 4.2238, + "theoretical_loss": 5.097305070804255, + "tokens_seen": 63111168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953861584754263, + "loss": 4.1532, + "theoretical_loss": 5.096523914017688, + "tokens_seen": 63176704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953761283851555, + "loss": 4.0169, + "theoretical_loss": 5.095743793763747, + "tokens_seen": 63242240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953660982948846, + "loss": 4.1325, + "theoretical_loss": 5.094964707595047, + "tokens_seen": 63307776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953560682046139, + "loss": 4.1509, + "theoretical_loss": 5.094186653072505, + "tokens_seen": 63373312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953460381143431, + "loss": 4.0627, + "theoretical_loss": 5.093409627765306, + "tokens_seen": 63438848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953360080240723, + "loss": 4.1483, + "theoretical_loss": 5.092633629250866, + "tokens_seen": 63504384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953259779338014, + "loss": 4.0753, + "theoretical_loss": 5.091858655114796, + "tokens_seen": 63569920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953159478435306, + "loss": 4.1402, + "theoretical_loss": 5.091084702950868, + "tokens_seen": 63635456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004953059177532598, + "loss": 4.1114, + "theoretical_loss": 5.090311770360971, + "tokens_seen": 63700992 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495295887662989, + "loss": 4.1984, + "theoretical_loss": 5.089539854955088, + "tokens_seen": 63766528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952858575727182, + "loss": 4.2246, + "theoretical_loss": 5.088768954351249, + "tokens_seen": 63832064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 134120, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.254392623901367, + "objective/train/theoretical_loss": 5.087999066175502, + "objective/train/tokens_used": 84357600, + "theoretical_loss": 5.087999066175502, + "tokens_seen": 63897600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952758274824474, + "loss": 4.2577, + "theoretical_loss": 5.087999066175502, + "tokens_seen": 63897600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952657973921765, + "loss": 4.1643, + "theoretical_loss": 5.0872301880618735, + "tokens_seen": 63963136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952557673019057, + "loss": 4.1536, + "theoretical_loss": 5.086462317652341, + "tokens_seen": 64028672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952457372116349, + "loss": 4.0992, + "theoretical_loss": 5.085695452596788, + "tokens_seen": 64094208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952357071213641, + "loss": 4.1679, + "theoretical_loss": 5.084929590552976, + "tokens_seen": 64159744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952256770310933, + "loss": 3.9607, + "theoretical_loss": 5.0841647291865115, + "tokens_seen": 64225280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952156469408225, + "loss": 4.2437, + "theoretical_loss": 5.083400866170806, + "tokens_seen": 64290816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952056168505516, + "loss": 3.962, + "theoretical_loss": 5.082637999187046, + "tokens_seen": 64356352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951955867602809, + "loss": 4.1419, + "theoretical_loss": 5.081876125924159, + "tokens_seen": 64421888 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049518555667001, + "loss": 4.1098, + "theoretical_loss": 5.0811152440787755, + "tokens_seen": 64487424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951755265797393, + "loss": 4.1977, + "theoretical_loss": 5.0803553513552036, + "tokens_seen": 64552960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951654964894685, + "loss": 4.1363, + "theoretical_loss": 5.079596445465386, + "tokens_seen": 64618496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951554663991976, + "loss": 4.2408, + "theoretical_loss": 5.078838524128878, + "tokens_seen": 64684032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951454363089268, + "loss": 4.0796, + "theoretical_loss": 5.078081585072802, + "tokens_seen": 64749568 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495135406218656, + "loss": 4.1091, + "theoretical_loss": 5.077325626031826, + "tokens_seen": 64815104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951253761283852, + "loss": 4.0651, + "theoretical_loss": 5.076570644748123, + "tokens_seen": 64880640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951153460381144, + "loss": 4.1498, + "theoretical_loss": 5.075816638971341, + "tokens_seen": 64946176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951053159478435, + "loss": 4.0684, + "theoretical_loss": 5.075063606458576, + "tokens_seen": 65011712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950952858575727, + "loss": 4.1423, + "theoretical_loss": 5.074311544974331, + "tokens_seen": 65077248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950852557673019, + "loss": 3.9332, + "theoretical_loss": 5.07356045229049, + "tokens_seen": 65142784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950752256770311, + "loss": 3.9928, + "theoretical_loss": 5.072810326186285, + "tokens_seen": 65208320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950651955867603, + "loss": 4.22, + "theoretical_loss": 5.072061164448261, + "tokens_seen": 65273856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950551654964894, + "loss": 4.0175, + "theoretical_loss": 5.071312964870252, + "tokens_seen": 65339392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950451354062186, + "loss": 4.0033, + "theoretical_loss": 5.070565725253344, + "tokens_seen": 65404928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950351053159479, + "loss": 4.1901, + "theoretical_loss": 5.069819443405842, + "tokens_seen": 65470464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 137067, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.357162952423096, + "objective/train/theoretical_loss": 5.069074117143246, + "objective/train/tokens_used": 85996000, + "theoretical_loss": 5.069074117143246, + "tokens_seen": 65536000 + }, + { + "epoch": 0.02, + "learning_rate": 0.000495025075225677, + "loss": 4.1989, + "theoretical_loss": 5.069074117143246, + "tokens_seen": 65536000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950150451354063, + "loss": 4.1362, + "theoretical_loss": 5.068329744288216, + "tokens_seen": 65601536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950050150451354, + "loss": 4.264, + "theoretical_loss": 5.067586322670541, + "tokens_seen": 65667072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949949849548646, + "loss": 4.1825, + "theoretical_loss": 5.0668438501271105, + "tokens_seen": 65732608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949849548645938, + "loss": 4.0695, + "theoretical_loss": 5.066102324501883, + "tokens_seen": 65798144 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494974924774323, + "loss": 4.1155, + "theoretical_loss": 5.065361743645855, + "tokens_seen": 65863680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949648946840522, + "loss": 3.9243, + "theoretical_loss": 5.064622105417033, + "tokens_seen": 65929216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949548645937814, + "loss": 4.0018, + "theoretical_loss": 5.063883407680405, + "tokens_seen": 65994752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949448345035105, + "loss": 4.1605, + "theoretical_loss": 5.063145648307904, + "tokens_seen": 66060288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949348044132397, + "loss": 3.9984, + "theoretical_loss": 5.062408825178388, + "tokens_seen": 66125824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949247743229689, + "loss": 4.1201, + "theoretical_loss": 5.061672936177604, + "tokens_seen": 66191360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949147442326981, + "loss": 3.9848, + "theoretical_loss": 5.06093797919816, + "tokens_seen": 66256896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949047141424273, + "loss": 4.0673, + "theoretical_loss": 5.060203952139497, + "tokens_seen": 66322432 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948946840521565, + "loss": 3.9158, + "theoretical_loss": 5.059470852907861, + "tokens_seen": 66387968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948846539618856, + "loss": 4.1779, + "theoretical_loss": 5.0587386794162725, + "tokens_seen": 66453504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948746238716148, + "loss": 4.2047, + "theoretical_loss": 5.058007429584498, + "tokens_seen": 66519040 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494864593781344, + "loss": 4.2382, + "theoretical_loss": 5.057277101339023, + "tokens_seen": 66584576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948545636910733, + "loss": 3.9197, + "theoretical_loss": 5.056547692613021, + "tokens_seen": 66650112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948445336008024, + "loss": 3.9336, + "theoretical_loss": 5.055819201346331, + "tokens_seen": 66715648 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948345035105316, + "loss": 4.1466, + "theoretical_loss": 5.055091625485421, + "tokens_seen": 66781184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948244734202607, + "loss": 4.2296, + "theoretical_loss": 5.054364962983367, + "tokens_seen": 66846720 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049481444332999, + "loss": 4.1142, + "theoretical_loss": 5.053639211799824, + "tokens_seen": 66912256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948044132397192, + "loss": 4.0936, + "theoretical_loss": 5.052914369900997, + "tokens_seen": 66977792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947943831494484, + "loss": 4.0154, + "theoretical_loss": 5.052190435259614, + "tokens_seen": 67043328 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947843530591776, + "loss": 4.1844, + "theoretical_loss": 5.051467405854897, + "tokens_seen": 67108864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 139457, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.18411111831665, + "objective/train/theoretical_loss": 5.05074527967254, + "objective/train/tokens_used": 87634400, + "theoretical_loss": 5.05074527967254, + "tokens_seen": 67174400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947743229689067, + "loss": 3.984, + "theoretical_loss": 5.05074527967254, + "tokens_seen": 67174400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947642928786359, + "loss": 3.9318, + "theoretical_loss": 5.050024054704677, + "tokens_seen": 67239936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947542627883651, + "loss": 3.9733, + "theoretical_loss": 5.049303728949859, + "tokens_seen": 67305472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947442326980943, + "loss": 4.1703, + "theoretical_loss": 5.048584300413019, + "tokens_seen": 67371008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947342026078235, + "loss": 3.9338, + "theoretical_loss": 5.04786576710546, + "tokens_seen": 67436544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947241725175527, + "loss": 4.1424, + "theoretical_loss": 5.0471481270448155, + "tokens_seen": 67502080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947141424272818, + "loss": 4.1751, + "theoretical_loss": 5.046431378255027, + "tokens_seen": 67567616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947041123370111, + "loss": 4.0552, + "theoretical_loss": 5.045715518766322, + "tokens_seen": 67633152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946940822467402, + "loss": 4.2581, + "theoretical_loss": 5.0450005466151815, + "tokens_seen": 67698688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946840521564695, + "loss": 3.9804, + "theoretical_loss": 5.044286459844319, + "tokens_seen": 67764224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946740220661987, + "loss": 4.1464, + "theoretical_loss": 5.043573256502652, + "tokens_seen": 67829760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946639919759278, + "loss": 4.2229, + "theoretical_loss": 5.0428609346452795, + "tokens_seen": 67895296 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494653961885657, + "loss": 4.2558, + "theoretical_loss": 5.042149492333452, + "tokens_seen": 67960832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946439317953862, + "loss": 4.1359, + "theoretical_loss": 5.041438927634549, + "tokens_seen": 68026368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946339017051154, + "loss": 4.1385, + "theoretical_loss": 5.040729238622053, + "tokens_seen": 68091904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946238716148446, + "loss": 4.0283, + "theoretical_loss": 5.040020423375525, + "tokens_seen": 68157440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946138415245737, + "loss": 4.1403, + "theoretical_loss": 5.039312479980579, + "tokens_seen": 68222976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946038114343029, + "loss": 3.9614, + "theoretical_loss": 5.038605406528857, + "tokens_seen": 68288512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945937813440321, + "loss": 4.1334, + "theoretical_loss": 5.037899201118005, + "tokens_seen": 68354048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945837512537613, + "loss": 3.8877, + "theoretical_loss": 5.037193861851646, + "tokens_seen": 68419584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945737211634905, + "loss": 3.9056, + "theoretical_loss": 5.03648938683936, + "tokens_seen": 68485120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945636910732196, + "loss": 3.9781, + "theoretical_loss": 5.035785774196654, + "tokens_seen": 68550656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945536609829488, + "loss": 4.2372, + "theoretical_loss": 5.035083022044944, + "tokens_seen": 68616192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945436308926781, + "loss": 4.0348, + "theoretical_loss": 5.034381128511525, + "tokens_seen": 68681728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945336008024072, + "loss": 4.0769, + "theoretical_loss": 5.0336800917295506, + "tokens_seen": 68747264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 142114, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.129088401794434, + "objective/train/theoretical_loss": 5.032979909838007, + "objective/train/tokens_used": 89272800, + "theoretical_loss": 5.032979909838007, + "tokens_seen": 68812800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945235707121365, + "loss": 4.0085, + "theoretical_loss": 5.032979909838007, + "tokens_seen": 68812800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945135406218656, + "loss": 4.0485, + "theoretical_loss": 5.032280580981691, + "tokens_seen": 68878336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004945035105315948, + "loss": 4.0022, + "theoretical_loss": 5.031582103311187, + "tokens_seen": 68943872 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494493480441324, + "loss": 3.9851, + "theoretical_loss": 5.030884474982842, + "tokens_seen": 69009408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944834503510532, + "loss": 3.9722, + "theoretical_loss": 5.030187694158739, + "tokens_seen": 69074944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944734202607824, + "loss": 4.0643, + "theoretical_loss": 5.02949175900668, + "tokens_seen": 69140480 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944633901705116, + "loss": 3.9744, + "theoretical_loss": 5.028796667700159, + "tokens_seen": 69206016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944533600802407, + "loss": 4.1231, + "theoretical_loss": 5.0281024184183405, + "tokens_seen": 69271552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944433299899699, + "loss": 4.1382, + "theoretical_loss": 5.0274090093460355, + "tokens_seen": 69337088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944332998996991, + "loss": 4.0752, + "theoretical_loss": 5.026716438673677, + "tokens_seen": 69402624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944232698094283, + "loss": 3.8675, + "theoretical_loss": 5.0260247045973045, + "tokens_seen": 69468160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944132397191575, + "loss": 4.0671, + "theoretical_loss": 5.02533380531853, + "tokens_seen": 69533696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944032096288867, + "loss": 4.0056, + "theoretical_loss": 5.024643739044526, + "tokens_seen": 69599232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943931795386158, + "loss": 4.0967, + "theoretical_loss": 5.023954503987998, + "tokens_seen": 69664768 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494383149448345, + "loss": 3.911, + "theoretical_loss": 5.023266098367161, + "tokens_seen": 69730304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943731193580742, + "loss": 4.1397, + "theoretical_loss": 5.022578520405721, + "tokens_seen": 69795840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943630892678035, + "loss": 4.0588, + "theoretical_loss": 5.0218917683328534, + "tokens_seen": 69861376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943530591775326, + "loss": 4.1069, + "theoretical_loss": 5.021205840383175, + "tokens_seen": 69926912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943430290872618, + "loss": 3.8213, + "theoretical_loss": 5.020520734796728, + "tokens_seen": 69992448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943329989969909, + "loss": 4.1406, + "theoretical_loss": 5.019836449818957, + "tokens_seen": 70057984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943229689067202, + "loss": 4.0369, + "theoretical_loss": 5.019152983700687, + "tokens_seen": 70123520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943129388164494, + "loss": 4.0687, + "theoretical_loss": 5.018470334698101, + "tokens_seen": 70189056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004943029087261786, + "loss": 3.6628, + "theoretical_loss": 5.01778850107272, + "tokens_seen": 70254592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942928786359078, + "loss": 3.8906, + "theoretical_loss": 5.017107481091379, + "tokens_seen": 70320128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942828485456369, + "loss": 4.0573, + "theoretical_loss": 5.016427273026212, + "tokens_seen": 70385664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 144953, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.063969612121582, + "objective/train/theoretical_loss": 5.015747875154622, + "objective/train/tokens_used": 90911200, + "theoretical_loss": 5.015747875154622, + "tokens_seen": 70451200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942728184553661, + "loss": 4.1182, + "theoretical_loss": 5.015747875154622, + "tokens_seen": 70451200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942627883650953, + "loss": 4.0265, + "theoretical_loss": 5.015069285759269, + "tokens_seen": 70516736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942527582748245, + "loss": 4.0582, + "theoretical_loss": 5.01439150312804, + "tokens_seen": 70582272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942427281845537, + "loss": 3.9481, + "theoretical_loss": 5.0137145255540405, + "tokens_seen": 70647808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942326980942828, + "loss": 3.9882, + "theoretical_loss": 5.013038351335559, + "tokens_seen": 70713344 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494222668004012, + "loss": 4.0168, + "theoretical_loss": 5.012362978776057, + "tokens_seen": 70778880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942126379137412, + "loss": 3.8963, + "theoretical_loss": 5.011688406184147, + "tokens_seen": 70844416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942026078234704, + "loss": 4.008, + "theoretical_loss": 5.011014631873566, + "tokens_seen": 70909952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941925777331996, + "loss": 3.9744, + "theoretical_loss": 5.010341654163167, + "tokens_seen": 70975488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941825476429289, + "loss": 3.9934, + "theoretical_loss": 5.009669471376882, + "tokens_seen": 71041024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941725175526579, + "loss": 3.9361, + "theoretical_loss": 5.008998081843721, + "tokens_seen": 71106560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941624874623872, + "loss": 4.0719, + "theoretical_loss": 5.008327483897736, + "tokens_seen": 71172096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941524573721163, + "loss": 3.9044, + "theoretical_loss": 5.00765767587801, + "tokens_seen": 71237632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941424272818456, + "loss": 4.1923, + "theoretical_loss": 5.006988656128635, + "tokens_seen": 71303168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941323971915748, + "loss": 4.0066, + "theoretical_loss": 5.006320422998691, + "tokens_seen": 71368704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941223671013039, + "loss": 4.0912, + "theoretical_loss": 5.00565297484223, + "tokens_seen": 71434240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941123370110331, + "loss": 4.1143, + "theoretical_loss": 5.004986310018252, + "tokens_seen": 71499776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941023069207623, + "loss": 4.0003, + "theoretical_loss": 5.004320426890686, + "tokens_seen": 71565312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940922768304915, + "loss": 3.9739, + "theoretical_loss": 5.003655323828376, + "tokens_seen": 71630848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940822467402207, + "loss": 3.954, + "theoretical_loss": 5.002990999205057, + "tokens_seen": 71696384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940722166499498, + "loss": 3.9475, + "theoretical_loss": 5.002327451399335, + "tokens_seen": 71761920 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494062186559679, + "loss": 3.9981, + "theoretical_loss": 5.001664678794671, + "tokens_seen": 71827456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940521564694082, + "loss": 4.188, + "theoretical_loss": 5.001002679779363, + "tokens_seen": 71892992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940421263791374, + "loss": 3.9514, + "theoretical_loss": 5.0003414527465235, + "tokens_seen": 71958528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940320962888666, + "loss": 3.8605, + "theoretical_loss": 4.99968099609406, + "tokens_seen": 72024064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 146265, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8565335273742676, + "objective/train/theoretical_loss": 4.999021308224664, + "objective/train/tokens_used": 92549600, + "theoretical_loss": 4.999021308224664, + "tokens_seen": 72089600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940220661985958, + "loss": 3.7912, + "theoretical_loss": 4.999021308224664, + "tokens_seen": 72089600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940120361083249, + "loss": 3.9225, + "theoretical_loss": 4.998362387545782, + "tokens_seen": 72155136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940020060180542, + "loss": 3.9875, + "theoretical_loss": 4.997704232469606, + "tokens_seen": 72220672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939919759277834, + "loss": 4.0243, + "theoretical_loss": 4.997046841413049, + "tokens_seen": 72286208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939819458375126, + "loss": 3.7861, + "theoretical_loss": 4.996390212797728, + "tokens_seen": 72351744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939719157472418, + "loss": 3.7494, + "theoretical_loss": 4.995734345049949, + "tokens_seen": 72417280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493961885656971, + "loss": 4.0262, + "theoretical_loss": 4.995079236600686, + "tokens_seen": 72482816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939518555667001, + "loss": 4.0516, + "theoretical_loss": 4.994424885885564, + "tokens_seen": 72548352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939418254764293, + "loss": 4.2025, + "theoretical_loss": 4.993771291344839, + "tokens_seen": 72613888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939317953861585, + "loss": 4.0091, + "theoretical_loss": 4.993118451423381, + "tokens_seen": 72679424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939217652958877, + "loss": 4.0021, + "theoretical_loss": 4.992466364570659, + "tokens_seen": 72744960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939117352056169, + "loss": 3.9088, + "theoretical_loss": 4.991815029240721, + "tokens_seen": 72810496 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493901705115346, + "loss": 3.9048, + "theoretical_loss": 4.991164443892175, + "tokens_seen": 72876032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938916750250752, + "loss": 3.9719, + "theoretical_loss": 4.990514606988173, + "tokens_seen": 72941568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938816449348044, + "loss": 3.9218, + "theoretical_loss": 4.989865516996396, + "tokens_seen": 73007104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938716148445337, + "loss": 3.9095, + "theoretical_loss": 4.98921717238903, + "tokens_seen": 73072640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938615847542628, + "loss": 3.8863, + "theoretical_loss": 4.988569571642756, + "tokens_seen": 73138176 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493851554663992, + "loss": 4.0022, + "theoretical_loss": 4.98792271323873, + "tokens_seen": 73203712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938415245737211, + "loss": 3.7051, + "theoretical_loss": 4.9872765956625615, + "tokens_seen": 73269248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938314944834504, + "loss": 4.1101, + "theoretical_loss": 4.9866312174043035, + "tokens_seen": 73334784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938214643931796, + "loss": 3.9701, + "theoretical_loss": 4.9859865769584335, + "tokens_seen": 73400320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004938114343029088, + "loss": 3.9967, + "theoretical_loss": 4.9853426728238315, + "tokens_seen": 73465856 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493801404212638, + "loss": 3.926, + "theoretical_loss": 4.984699503503771, + "tokens_seen": 73531392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937913741223671, + "loss": 3.97, + "theoretical_loss": 4.984057067505898, + "tokens_seen": 73596928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937813440320963, + "loss": 3.7532, + "theoretical_loss": 4.9834153633422105, + "tokens_seen": 73662464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 149094, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8529977798461914, + "objective/train/theoretical_loss": 4.982774389529053, + "objective/train/tokens_used": 94188000, + "theoretical_loss": 4.982774389529053, + "tokens_seen": 73728000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937713139418255, + "loss": 3.9646, + "theoretical_loss": 4.982774389529053, + "tokens_seen": 73728000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937612838515547, + "loss": 4.0559, + "theoretical_loss": 4.9821341445870875, + "tokens_seen": 73793536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937512537612839, + "loss": 3.9924, + "theoretical_loss": 4.981494627041286, + "tokens_seen": 73859072 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493741223671013, + "loss": 3.8326, + "theoretical_loss": 4.98085583542091, + "tokens_seen": 73924608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937311935807422, + "loss": 3.9955, + "theoretical_loss": 4.980217768259496, + "tokens_seen": 73990144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937211634904714, + "loss": 4.04, + "theoretical_loss": 4.979580424094836, + "tokens_seen": 74055680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937111334002006, + "loss": 4.0745, + "theoretical_loss": 4.978943801468967, + "tokens_seen": 74121216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004937011033099298, + "loss": 4.1224, + "theoretical_loss": 4.978307898928149, + "tokens_seen": 74186752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936910732196591, + "loss": 4.0011, + "theoretical_loss": 4.977672715022855, + "tokens_seen": 74252288 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936810431293881, + "loss": 3.8266, + "theoretical_loss": 4.97703824830775, + "tokens_seen": 74317824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936710130391174, + "loss": 3.8387, + "theoretical_loss": 4.976404497341676, + "tokens_seen": 74383360 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936609829488465, + "loss": 3.9344, + "theoretical_loss": 4.975771460687641, + "tokens_seen": 74448896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936509528585758, + "loss": 4.0611, + "theoretical_loss": 4.975139136912794, + "tokens_seen": 74514432 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493640922768305, + "loss": 3.9529, + "theoretical_loss": 4.974507524588424, + "tokens_seen": 74579968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936308926780341, + "loss": 4.0649, + "theoretical_loss": 4.973876622289927, + "tokens_seen": 74645504 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936208625877633, + "loss": 3.8079, + "theoretical_loss": 4.973246428596802, + "tokens_seen": 74711040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936108324974925, + "loss": 3.8539, + "theoretical_loss": 4.972616942092634, + "tokens_seen": 74776576 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936008024072217, + "loss": 3.7971, + "theoretical_loss": 4.971988161365077, + "tokens_seen": 74842112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935907723169509, + "loss": 3.9891, + "theoretical_loss": 4.9713600850058395, + "tokens_seen": 74907648 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049358074222668, + "loss": 3.8772, + "theoretical_loss": 4.970732711610667, + "tokens_seen": 74973184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935707121364092, + "loss": 3.9861, + "theoretical_loss": 4.97010603977933, + "tokens_seen": 75038720 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935606820461384, + "loss": 4.0499, + "theoretical_loss": 4.96948006811561, + "tokens_seen": 75104256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935506519558676, + "loss": 4.0121, + "theoretical_loss": 4.968854795227281, + "tokens_seen": 75169792 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935406218655968, + "loss": 3.986, + "theoretical_loss": 4.968230219726093, + "tokens_seen": 75235328 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493530591775326, + "loss": 4.0126, + "theoretical_loss": 4.967606340227765, + "tokens_seen": 75300864 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 151789, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7915000915527344, + "objective/train/theoretical_loss": 4.966983155351962, + "objective/train/tokens_used": 95826400, + "theoretical_loss": 4.966983155351962, + "tokens_seen": 75366400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935205616850551, + "loss": 4.0154, + "theoretical_loss": 4.966983155351962, + "tokens_seen": 75366400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935105315947844, + "loss": 3.9018, + "theoretical_loss": 4.966360663722287, + "tokens_seen": 75431936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004935005015045135, + "loss": 3.9981, + "theoretical_loss": 4.96573886396626, + "tokens_seen": 75497472 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934904714142428, + "loss": 3.9699, + "theoretical_loss": 4.965117754715307, + "tokens_seen": 75563008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934804413239719, + "loss": 3.9042, + "theoretical_loss": 4.964497334604748, + "tokens_seen": 75628544 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934704112337011, + "loss": 3.8988, + "theoretical_loss": 4.963877602273776, + "tokens_seen": 75694080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934603811434303, + "loss": 3.896, + "theoretical_loss": 4.963258556365449, + "tokens_seen": 75759616 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934503510531595, + "loss": 3.8381, + "theoretical_loss": 4.962640195526673, + "tokens_seen": 75825152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934403209628887, + "loss": 3.9705, + "theoretical_loss": 4.962022518408183, + "tokens_seen": 75890688 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934302908726179, + "loss": 3.8635, + "theoretical_loss": 4.96140552366454, + "tokens_seen": 75956224 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493420260782347, + "loss": 3.8523, + "theoretical_loss": 4.9607892099541075, + "tokens_seen": 76021760 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934102306920762, + "loss": 4.1121, + "theoretical_loss": 4.9601735759390415, + "tokens_seen": 76087296 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934002006018054, + "loss": 3.8167, + "theoretical_loss": 4.959558620285274, + "tokens_seen": 76152832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933901705115346, + "loss": 3.8953, + "theoretical_loss": 4.958944341662502, + "tokens_seen": 76218368 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933801404212638, + "loss": 3.7088, + "theoretical_loss": 4.958330738744172, + "tokens_seen": 76283904 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493370110330993, + "loss": 3.9463, + "theoretical_loss": 4.957717810207466, + "tokens_seen": 76349440 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933600802407221, + "loss": 3.7653, + "theoretical_loss": 4.957105554733289, + "tokens_seen": 76414976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933500501504513, + "loss": 3.9133, + "theoretical_loss": 4.956493971006253, + "tokens_seen": 76480512 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933400200601805, + "loss": 3.914, + "theoretical_loss": 4.955883057714669, + "tokens_seen": 76546048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933299899699098, + "loss": 3.8421, + "theoretical_loss": 4.955272813550524, + "tokens_seen": 76611584 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933199598796389, + "loss": 3.9184, + "theoretical_loss": 4.954663237209477, + "tokens_seen": 76677120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933099297893682, + "loss": 3.9241, + "theoretical_loss": 4.954054327390841, + "tokens_seen": 76742656 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932998996990972, + "loss": 3.7632, + "theoretical_loss": 4.9534460827975675, + "tokens_seen": 76808192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932898696088265, + "loss": 3.9168, + "theoretical_loss": 4.952838502136241, + "tokens_seen": 76873728 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932798395185557, + "loss": 3.9541, + "theoretical_loss": 4.952231584117056, + "tokens_seen": 76939264 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 154459, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.1298017501831055, + "objective/train/theoretical_loss": 4.951625327453812, + "objective/train/tokens_used": 97464800, + "theoretical_loss": 4.951625327453812, + "tokens_seen": 77004800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932698094282849, + "loss": 3.993, + "theoretical_loss": 4.951625327453812, + "tokens_seen": 77004800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932597793380141, + "loss": 3.9194, + "theoretical_loss": 4.951019730863894, + "tokens_seen": 77070336 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932497492477432, + "loss": 3.952, + "theoretical_loss": 4.950414793068266, + "tokens_seen": 77135872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932397191574724, + "loss": 3.887, + "theoretical_loss": 4.94981051279145, + "tokens_seen": 77201408 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932296890672016, + "loss": 3.911, + "theoretical_loss": 4.94920688876152, + "tokens_seen": 77266944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932196589769308, + "loss": 3.8329, + "theoretical_loss": 4.948603919710088, + "tokens_seen": 77332480 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049320962888666, + "loss": 3.9142, + "theoretical_loss": 4.948001604372287, + "tokens_seen": 77398016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931995987963893, + "loss": 3.9201, + "theoretical_loss": 4.947399941486762, + "tokens_seen": 77463552 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931895687061183, + "loss": 4.1649, + "theoretical_loss": 4.946798929795658, + "tokens_seen": 77529088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931795386158476, + "loss": 3.7941, + "theoretical_loss": 4.946198568044602, + "tokens_seen": 77594624 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931695085255767, + "loss": 3.9987, + "theoretical_loss": 4.945598854982698, + "tokens_seen": 77660160 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493159478435306, + "loss": 3.7635, + "theoretical_loss": 4.944999789362508, + "tokens_seen": 77725696 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931494483450352, + "loss": 3.8495, + "theoretical_loss": 4.944401369940043, + "tokens_seen": 77791232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931394182547643, + "loss": 3.7598, + "theoretical_loss": 4.9438035954747495, + "tokens_seen": 77856768 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931293881644935, + "loss": 4.003, + "theoretical_loss": 4.9432064647294975, + "tokens_seen": 77922304 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931193580742227, + "loss": 4.0136, + "theoretical_loss": 4.942609976470566, + "tokens_seen": 77987840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931093279839519, + "loss": 3.7645, + "theoretical_loss": 4.942014129467637, + "tokens_seen": 78053376 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930992978936811, + "loss": 3.7788, + "theoretical_loss": 4.941418922493774, + "tokens_seen": 78118912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930892678034102, + "loss": 3.8539, + "theoretical_loss": 4.940824354325419, + "tokens_seen": 78184448 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930792377131394, + "loss": 4.0117, + "theoretical_loss": 4.940230423742372, + "tokens_seen": 78249984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930692076228686, + "loss": 3.9347, + "theoretical_loss": 4.939637129527789, + "tokens_seen": 78315520 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930591775325978, + "loss": 3.7836, + "theoretical_loss": 4.939044470468156, + "tokens_seen": 78381056 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493049147442327, + "loss": 3.8655, + "theoretical_loss": 4.938452445353294, + "tokens_seen": 78446592 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930391173520562, + "loss": 3.7515, + "theoretical_loss": 4.937861052976332, + "tokens_seen": 78512128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930290872617853, + "loss": 3.8879, + "theoretical_loss": 4.937270292133704, + "tokens_seen": 78577664 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 157231, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.148642539978027, + "objective/train/theoretical_loss": 4.9366801616251355, + "objective/train/tokens_used": 99103200, + "theoretical_loss": 4.9366801616251355, + "tokens_seen": 78643200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930190571715146, + "loss": 3.9348, + "theoretical_loss": 4.9366801616251355, + "tokens_seen": 78643200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930090270812437, + "loss": 4.0004, + "theoretical_loss": 4.93609066025363, + "tokens_seen": 78708736 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492998996990973, + "loss": 3.8573, + "theoretical_loss": 4.935501786825457, + "tokens_seen": 78774272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929889669007021, + "loss": 3.7744, + "theoretical_loss": 4.934913540150143, + "tokens_seen": 78839808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929789368104313, + "loss": 3.9438, + "theoretical_loss": 4.934325919040461, + "tokens_seen": 78905344 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929689067201605, + "loss": 3.9372, + "theoretical_loss": 4.933738922312413, + "tokens_seen": 78970880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929588766298897, + "loss": 3.928, + "theoretical_loss": 4.933152548785222, + "tokens_seen": 79036416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929488465396189, + "loss": 3.6162, + "theoretical_loss": 4.932566797281324, + "tokens_seen": 79101952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929388164493481, + "loss": 3.8039, + "theoretical_loss": 4.931981666626351, + "tokens_seen": 79167488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929287863590773, + "loss": 3.9653, + "theoretical_loss": 4.931397155649121, + "tokens_seen": 79233024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929187562688064, + "loss": 3.9489, + "theoretical_loss": 4.930813263181631, + "tokens_seen": 79298560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929087261785356, + "loss": 3.8511, + "theoretical_loss": 4.93022998805904, + "tokens_seen": 79364096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928986960882648, + "loss": 3.8178, + "theoretical_loss": 4.929647329119659, + "tokens_seen": 79429632 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492888665997994, + "loss": 3.879, + "theoretical_loss": 4.9290652852049455, + "tokens_seen": 79495168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928786359077232, + "loss": 3.9235, + "theoretical_loss": 4.928483855159485, + "tokens_seen": 79560704 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928686058174523, + "loss": 3.9613, + "theoretical_loss": 4.927903037830983, + "tokens_seen": 79626240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928585757271815, + "loss": 3.8301, + "theoretical_loss": 4.9273228320702565, + "tokens_seen": 79691776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928485456369107, + "loss": 3.6294, + "theoretical_loss": 4.926743236731218, + "tokens_seen": 79757312 + }, + { + "epoch": 0.02, + "learning_rate": 0.00049283851554664, + "loss": 3.8785, + "theoretical_loss": 4.926164250670868, + "tokens_seen": 79822848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928284854563691, + "loss": 3.8887, + "theoretical_loss": 4.925585872749284, + "tokens_seen": 79888384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928184553660984, + "loss": 3.8091, + "theoretical_loss": 4.925008101829608, + "tokens_seen": 79953920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928084252758274, + "loss": 3.8664, + "theoretical_loss": 4.9244309367780374, + "tokens_seen": 80019456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927983951855567, + "loss": 3.7095, + "theoretical_loss": 4.923854376463816, + "tokens_seen": 80084992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927883650952859, + "loss": 3.6023, + "theoretical_loss": 4.923278419759217, + "tokens_seen": 80150528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927783350050151, + "loss": 4.0525, + "theoretical_loss": 4.92270306553954, + "tokens_seen": 80216064 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 158547, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5380027294158936, + "objective/train/theoretical_loss": 4.922128312683096, + "objective/train/tokens_used": 100741600, + "theoretical_loss": 4.922128312683096, + "tokens_seen": 80281600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927683049147443, + "loss": 3.8218, + "theoretical_loss": 4.922128312683096, + "tokens_seen": 80281600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927582748244734, + "loss": 3.8064, + "theoretical_loss": 4.921554160071194, + "tokens_seen": 80347136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927482447342026, + "loss": 3.8461, + "theoretical_loss": 4.920980606588142, + "tokens_seen": 80412672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927382146439318, + "loss": 3.9728, + "theoretical_loss": 4.920407651121222, + "tokens_seen": 80478208 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492728184553661, + "loss": 3.686, + "theoretical_loss": 4.919835292560689, + "tokens_seen": 80543744 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927181544633902, + "loss": 3.7003, + "theoretical_loss": 4.919263529799759, + "tokens_seen": 80609280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927081243731193, + "loss": 3.863, + "theoretical_loss": 4.918692361734598, + "tokens_seen": 80674816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926980942828485, + "loss": 3.9446, + "theoretical_loss": 4.91812178726431, + "tokens_seen": 80740352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926880641925777, + "loss": 3.8734, + "theoretical_loss": 4.917551805290929, + "tokens_seen": 80805888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926780341023069, + "loss": 3.5919, + "theoretical_loss": 4.916982414719408, + "tokens_seen": 80871424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926680040120361, + "loss": 3.9023, + "theoretical_loss": 4.9164136144576105, + "tokens_seen": 80936960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926579739217654, + "loss": 4.0239, + "theoretical_loss": 4.915845403416299, + "tokens_seen": 81002496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926479438314944, + "loss": 3.8703, + "theoretical_loss": 4.915277780509124, + "tokens_seen": 81068032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926379137412237, + "loss": 3.8956, + "theoretical_loss": 4.914710744652614, + "tokens_seen": 81133568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926278836509528, + "loss": 3.9682, + "theoretical_loss": 4.914144294766169, + "tokens_seen": 81199104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926178535606821, + "loss": 3.851, + "theoretical_loss": 4.913578429772047, + "tokens_seen": 81264640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926078234704113, + "loss": 4.0165, + "theoretical_loss": 4.913013148595355, + "tokens_seen": 81330176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925977933801404, + "loss": 3.9144, + "theoretical_loss": 4.912448450164041, + "tokens_seen": 81395712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925877632898696, + "loss": 3.9316, + "theoretical_loss": 4.91188433340888, + "tokens_seen": 81461248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925777331995988, + "loss": 4.0616, + "theoretical_loss": 4.911320797263471, + "tokens_seen": 81526784 + }, + { + "epoch": 0.02, + "learning_rate": 0.000492567703109328, + "loss": 3.874, + "theoretical_loss": 4.910757840664219, + "tokens_seen": 81592320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925576730190572, + "loss": 3.8266, + "theoretical_loss": 4.910195462550334, + "tokens_seen": 81657856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925476429287864, + "loss": 3.8151, + "theoretical_loss": 4.909633661863811, + "tokens_seen": 81723392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925376128385155, + "loss": 3.6298, + "theoretical_loss": 4.909072437549434, + "tokens_seen": 81788928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925275827482447, + "loss": 3.8972, + "theoretical_loss": 4.908511788554753, + "tokens_seen": 81854464 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 161230, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.769374132156372, + "objective/train/theoretical_loss": 4.907951713830082, + "objective/train/tokens_used": 102380000, + "theoretical_loss": 4.907951713830082, + "tokens_seen": 81920000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925175526579739, + "loss": 3.745, + "theoretical_loss": 4.907951713830082, + "tokens_seen": 81920000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925075225677031, + "loss": 3.7704, + "theoretical_loss": 4.907392212328489, + "tokens_seen": 81985536 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924974924774323, + "loss": 3.8973, + "theoretical_loss": 4.906833283005785, + "tokens_seen": 82051072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924874623871615, + "loss": 3.9124, + "theoretical_loss": 4.906274924820515, + "tokens_seen": 82116608 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924774322968907, + "loss": 3.8759, + "theoretical_loss": 4.90571713673395, + "tokens_seen": 82182144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924674022066199, + "loss": 3.8056, + "theoretical_loss": 4.905159917710073, + "tokens_seen": 82247680 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924573721163491, + "loss": 3.8848, + "theoretical_loss": 4.904603266715578, + "tokens_seen": 82313216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924473420260783, + "loss": 3.8402, + "theoretical_loss": 4.904047182719854, + "tokens_seen": 82378752 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004924373119358075, + "loss": 3.635, + "theoretical_loss": 4.903491664694977, + "tokens_seen": 82444288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004924272818455366, + "loss": 3.8816, + "theoretical_loss": 4.902936711615702, + "tokens_seen": 82509824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004924172517552658, + "loss": 3.984, + "theoretical_loss": 4.902382322459456, + "tokens_seen": 82575360 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492407221664995, + "loss": 3.7499, + "theoretical_loss": 4.901828496206322, + "tokens_seen": 82640896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923971915747242, + "loss": 3.8105, + "theoretical_loss": 4.90127523183904, + "tokens_seen": 82706432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923871614844534, + "loss": 3.8429, + "theoretical_loss": 4.900722528342988, + "tokens_seen": 82771968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923771313941825, + "loss": 3.6828, + "theoretical_loss": 4.900170384706181, + "tokens_seen": 82837504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923671013039117, + "loss": 3.8442, + "theoretical_loss": 4.899618799919256, + "tokens_seen": 82903040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923570712136409, + "loss": 3.7676, + "theoretical_loss": 4.899067772975469, + "tokens_seen": 82968576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923470411233702, + "loss": 3.8443, + "theoretical_loss": 4.898517302870679, + "tokens_seen": 83034112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923370110330993, + "loss": 3.8722, + "theoretical_loss": 4.897967388603346, + "tokens_seen": 83099648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923269809428286, + "loss": 3.8633, + "theoretical_loss": 4.897418029174519, + "tokens_seen": 83165184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923169508525576, + "loss": 3.7328, + "theoretical_loss": 4.896869223587828, + "tokens_seen": 83230720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923069207622869, + "loss": 3.7986, + "theoretical_loss": 4.896320970849472, + "tokens_seen": 83296256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922968906720161, + "loss": 3.9642, + "theoretical_loss": 4.895773269968219, + "tokens_seen": 83361792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922868605817453, + "loss": 3.8536, + "theoretical_loss": 4.895226119955386, + "tokens_seen": 83427328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922768304914745, + "loss": 3.9965, + "theoretical_loss": 4.894679519824841, + "tokens_seen": 83492864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 164075, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 4.0510573387146, + "objective/train/theoretical_loss": 4.894133468592984, + "objective/train/tokens_used": 104018400, + "theoretical_loss": 4.894133468592984, + "tokens_seen": 83558400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922668004012036, + "loss": 3.8448, + "theoretical_loss": 4.894133468592984, + "tokens_seen": 83558400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922567703109328, + "loss": 3.738, + "theoretical_loss": 4.8935879652787495, + "tokens_seen": 83623936 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492246740220662, + "loss": 3.9158, + "theoretical_loss": 4.893043008903591, + "tokens_seen": 83689472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922367101303912, + "loss": 3.8833, + "theoretical_loss": 4.892498598491473, + "tokens_seen": 83755008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922266800401204, + "loss": 3.7888, + "theoretical_loss": 4.891954733068863, + "tokens_seen": 83820544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922166499498495, + "loss": 3.9444, + "theoretical_loss": 4.891411411664727, + "tokens_seen": 83886080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004922066198595787, + "loss": 3.8909, + "theoretical_loss": 4.890868633310515, + "tokens_seen": 83951616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921965897693079, + "loss": 3.6118, + "theoretical_loss": 4.890326397040158, + "tokens_seen": 84017152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921865596790371, + "loss": 3.9341, + "theoretical_loss": 4.889784701890056, + "tokens_seen": 84082688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921765295887663, + "loss": 3.7638, + "theoretical_loss": 4.8892435468990705, + "tokens_seen": 84148224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921664994984956, + "loss": 3.8125, + "theoretical_loss": 4.88870293110852, + "tokens_seen": 84213760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921564694082246, + "loss": 3.9401, + "theoretical_loss": 4.888162853562166, + "tokens_seen": 84279296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921464393179539, + "loss": 3.9121, + "theoretical_loss": 4.88762331330621, + "tokens_seen": 84344832 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492136409227683, + "loss": 3.8526, + "theoretical_loss": 4.88708430938928, + "tokens_seen": 84410368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921263791374123, + "loss": 3.9408, + "theoretical_loss": 4.8865458408624285, + "tokens_seen": 84475904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921163490471415, + "loss": 3.725, + "theoretical_loss": 4.8860079067791204, + "tokens_seen": 84541440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921063189568706, + "loss": 3.9026, + "theoretical_loss": 4.885470506195227, + "tokens_seen": 84606976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920962888665998, + "loss": 3.8294, + "theoretical_loss": 4.884933638169014, + "tokens_seen": 84672512 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492086258776329, + "loss": 3.6878, + "theoretical_loss": 4.88439730176114, + "tokens_seen": 84738048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920762286860582, + "loss": 3.8973, + "theoretical_loss": 4.883861496034644, + "tokens_seen": 84803584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920661985957874, + "loss": 3.8531, + "theoretical_loss": 4.88332622005494, + "tokens_seen": 84869120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920561685055166, + "loss": 3.7737, + "theoretical_loss": 4.8827914728898065, + "tokens_seen": 84934656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920461384152457, + "loss": 3.8671, + "theoretical_loss": 4.88225725360938, + "tokens_seen": 85000192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920361083249749, + "loss": 3.9901, + "theoretical_loss": 4.881723561286149, + "tokens_seen": 85065728 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920260782347041, + "loss": 3.7908, + "theoretical_loss": 4.881190394994943, + "tokens_seen": 85131264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 167121, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.860599994659424, + "objective/train/theoretical_loss": 4.880657753812926, + "objective/train/tokens_used": 105656800, + "theoretical_loss": 4.880657753812926, + "tokens_seen": 85196800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920160481444333, + "loss": 3.7432, + "theoretical_loss": 4.880657753812926, + "tokens_seen": 85196800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004920060180541625, + "loss": 3.8309, + "theoretical_loss": 4.880125636819594, + "tokens_seen": 85262336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919959879638916, + "loss": 3.9492, + "theoretical_loss": 4.879594043096755, + "tokens_seen": 85327872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919859578736209, + "loss": 3.9316, + "theoretical_loss": 4.879062971728534, + "tokens_seen": 85393408 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049197592778335, + "loss": 3.9782, + "theoretical_loss": 4.87853242180136, + "tokens_seen": 85458944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919658976930793, + "loss": 3.7071, + "theoretical_loss": 4.878002392403959, + "tokens_seen": 85524480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919558676028084, + "loss": 3.8632, + "theoretical_loss": 4.877472882627343, + "tokens_seen": 85590016 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919458375125377, + "loss": 3.8181, + "theoretical_loss": 4.8769438915648085, + "tokens_seen": 85655552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919358074222668, + "loss": 3.5253, + "theoretical_loss": 4.876415418311928, + "tokens_seen": 85721088 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491925777331996, + "loss": 3.8982, + "theoretical_loss": 4.875887461966537, + "tokens_seen": 85786624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919157472417252, + "loss": 3.6946, + "theoretical_loss": 4.875360021628733, + "tokens_seen": 85852160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919057171514544, + "loss": 3.7098, + "theoretical_loss": 4.874833096400865, + "tokens_seen": 85917696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918956870611836, + "loss": 3.8437, + "theoretical_loss": 4.874306685387525, + "tokens_seen": 85983232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918856569709127, + "loss": 3.911, + "theoretical_loss": 4.873780787695547, + "tokens_seen": 86048768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918756268806419, + "loss": 3.8862, + "theoretical_loss": 4.87325540243399, + "tokens_seen": 86114304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918655967903711, + "loss": 3.7524, + "theoretical_loss": 4.872730528714139, + "tokens_seen": 86179840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918555667001003, + "loss": 3.783, + "theoretical_loss": 4.872206165649493, + "tokens_seen": 86245376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918455366098295, + "loss": 3.8322, + "theoretical_loss": 4.871682312355761, + "tokens_seen": 86310912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918355065195586, + "loss": 3.9636, + "theoretical_loss": 4.871158967950852, + "tokens_seen": 86376448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918254764292878, + "loss": 3.8401, + "theoretical_loss": 4.870636131554869, + "tokens_seen": 86441984 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491815446339017, + "loss": 3.8193, + "theoretical_loss": 4.8701138022901045, + "tokens_seen": 86507520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918054162487463, + "loss": 3.7125, + "theoretical_loss": 4.869591979281028, + "tokens_seen": 86573056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917953861584754, + "loss": 3.7937, + "theoretical_loss": 4.8690706616542805, + "tokens_seen": 86638592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917853560682047, + "loss": 3.7368, + "theoretical_loss": 4.868549848538675, + "tokens_seen": 86704128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917753259779337, + "loss": 3.9825, + "theoretical_loss": 4.868029539065176, + "tokens_seen": 86769664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 170013, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.9133307933807373, + "objective/train/theoretical_loss": 4.867509732366907, + "objective/train/tokens_used": 107295200, + "theoretical_loss": 4.867509732366907, + "tokens_seen": 86835200 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491765295887663, + "loss": 3.8537, + "theoretical_loss": 4.867509732366907, + "tokens_seen": 86835200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917552657973922, + "loss": 3.7893, + "theoretical_loss": 4.866990427579129, + "tokens_seen": 86900736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917452357071214, + "loss": 3.6738, + "theoretical_loss": 4.866471623839248, + "tokens_seen": 86966272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917352056168506, + "loss": 3.821, + "theoretical_loss": 4.8659533202867955, + "tokens_seen": 87031808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917251755265797, + "loss": 3.6539, + "theoretical_loss": 4.86543551606343, + "tokens_seen": 87097344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917151454363089, + "loss": 3.7662, + "theoretical_loss": 4.864918210312927, + "tokens_seen": 87162880 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917051153460381, + "loss": 3.9022, + "theoretical_loss": 4.864401402181173, + "tokens_seen": 87228416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916950852557673, + "loss": 3.8134, + "theoretical_loss": 4.863885090816158, + "tokens_seen": 87293952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916850551654965, + "loss": 3.9001, + "theoretical_loss": 4.863369275367968, + "tokens_seen": 87359488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916750250752258, + "loss": 3.7364, + "theoretical_loss": 4.862853954988781, + "tokens_seen": 87425024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916649949849548, + "loss": 4.0256, + "theoretical_loss": 4.862339128832857, + "tokens_seen": 87490560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916549648946841, + "loss": 3.8218, + "theoretical_loss": 4.861824796056533, + "tokens_seen": 87556096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916449348044132, + "loss": 3.8639, + "theoretical_loss": 4.861310955818219, + "tokens_seen": 87621632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916349047141425, + "loss": 3.7911, + "theoretical_loss": 4.860797607278385, + "tokens_seen": 87687168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916248746238717, + "loss": 3.5873, + "theoretical_loss": 4.86028474959956, + "tokens_seen": 87752704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916148445336008, + "loss": 3.6722, + "theoretical_loss": 4.859772381946323, + "tokens_seen": 87818240 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049160481444333, + "loss": 3.7202, + "theoretical_loss": 4.859260503485298, + "tokens_seen": 87883776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915947843530592, + "loss": 3.8672, + "theoretical_loss": 4.858749113385144, + "tokens_seen": 87949312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915847542627884, + "loss": 3.9241, + "theoretical_loss": 4.858238210816554, + "tokens_seen": 88014848 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915747241725176, + "loss": 3.652, + "theoretical_loss": 4.8577277949522415, + "tokens_seen": 88080384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915646940822468, + "loss": 3.91, + "theoretical_loss": 4.857217864966943, + "tokens_seen": 88145920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915546639919759, + "loss": 3.8022, + "theoretical_loss": 4.856708420037402, + "tokens_seen": 88211456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915446339017051, + "loss": 3.8488, + "theoretical_loss": 4.8561994593423705, + "tokens_seen": 88276992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915346038114343, + "loss": 3.9325, + "theoretical_loss": 4.8556909820625975, + "tokens_seen": 88342528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915245737211635, + "loss": 3.6196, + "theoretical_loss": 4.855182987380823, + "tokens_seen": 88408064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 171411, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.794614315032959, + "objective/train/theoretical_loss": 4.854675474481779, + "objective/train/tokens_used": 108933600, + "theoretical_loss": 4.854675474481779, + "tokens_seen": 88473600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915145436308927, + "loss": 3.8269, + "theoretical_loss": 4.854675474481779, + "tokens_seen": 88473600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915045135406218, + "loss": 3.8042, + "theoretical_loss": 4.8541684425521705, + "tokens_seen": 88539136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914944834503511, + "loss": 3.8976, + "theoretical_loss": 4.85366189078068, + "tokens_seen": 88604672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914844533600802, + "loss": 3.7359, + "theoretical_loss": 4.853155818357957, + "tokens_seen": 88670208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914744232698095, + "loss": 3.7995, + "theoretical_loss": 4.852650224476609, + "tokens_seen": 88735744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914643931795386, + "loss": 3.6092, + "theoretical_loss": 4.852145108331205, + "tokens_seen": 88801280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914543630892679, + "loss": 3.8474, + "theoretical_loss": 4.851640469118255, + "tokens_seen": 88866816 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491444332998997, + "loss": 3.7632, + "theoretical_loss": 4.851136306036219, + "tokens_seen": 88932352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914343029087262, + "loss": 3.8003, + "theoretical_loss": 4.850632618285486, + "tokens_seen": 88997888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914242728184554, + "loss": 3.7013, + "theoretical_loss": 4.850129405068383, + "tokens_seen": 89063424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914142427281846, + "loss": 3.882, + "theoretical_loss": 4.849626665589156, + "tokens_seen": 89128960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914042126379138, + "loss": 3.8358, + "theoretical_loss": 4.849124399053969, + "tokens_seen": 89194496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913941825476429, + "loss": 3.8596, + "theoretical_loss": 4.8486226046709024, + "tokens_seen": 89260032 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913841524573721, + "loss": 3.8279, + "theoretical_loss": 4.8481212816499415, + "tokens_seen": 89325568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913741223671013, + "loss": 3.8282, + "theoretical_loss": 4.847620429202967, + "tokens_seen": 89391104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913640922768305, + "loss": 3.9284, + "theoretical_loss": 4.847120046543763, + "tokens_seen": 89456640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913540621865597, + "loss": 3.9146, + "theoretical_loss": 4.846620132887992, + "tokens_seen": 89522176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913440320962888, + "loss": 4.0135, + "theoretical_loss": 4.8461206874532055, + "tokens_seen": 89587712 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491334002006018, + "loss": 3.7788, + "theoretical_loss": 4.845621709458831, + "tokens_seen": 89653248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913239719157472, + "loss": 3.6995, + "theoretical_loss": 4.845123198126162, + "tokens_seen": 89718784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913139418254765, + "loss": 3.7693, + "theoretical_loss": 4.844625152678364, + "tokens_seen": 89784320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004913039117352056, + "loss": 3.9621, + "theoretical_loss": 4.844127572340455, + "tokens_seen": 89849856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912938816449349, + "loss": 3.9589, + "theoretical_loss": 4.84363045633931, + "tokens_seen": 89915392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912838515546639, + "loss": 3.6493, + "theoretical_loss": 4.843133803903651, + "tokens_seen": 89980928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912738214643932, + "loss": 3.617, + "theoretical_loss": 4.84263761426404, + "tokens_seen": 90046464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 174465, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.915762424468994, + "objective/train/theoretical_loss": 4.842141886652876, + "objective/train/tokens_used": 110572000, + "theoretical_loss": 4.842141886652876, + "tokens_seen": 90112000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912637913741224, + "loss": 3.8061, + "theoretical_loss": 4.842141886652876, + "tokens_seen": 90112000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912537612838516, + "loss": 3.8193, + "theoretical_loss": 4.841646620304388, + "tokens_seen": 90177536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912437311935808, + "loss": 3.8763, + "theoretical_loss": 4.841151814454632, + "tokens_seen": 90243072 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049123370110331, + "loss": 3.7813, + "theoretical_loss": 4.840657468341476, + "tokens_seen": 90308608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912236710130391, + "loss": 3.705, + "theoretical_loss": 4.84016358120461, + "tokens_seen": 90374144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912136409227683, + "loss": 3.8257, + "theoretical_loss": 4.839670152285526, + "tokens_seen": 90439680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912036108324975, + "loss": 3.6166, + "theoretical_loss": 4.8391771808275195, + "tokens_seen": 90505216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911935807422267, + "loss": 3.7789, + "theoretical_loss": 4.838684666075682, + "tokens_seen": 90570752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911835506519559, + "loss": 3.8341, + "theoretical_loss": 4.838192607276896, + "tokens_seen": 90636288 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491173520561685, + "loss": 3.6441, + "theoretical_loss": 4.837701003679829, + "tokens_seen": 90701824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911634904714142, + "loss": 3.6792, + "theoretical_loss": 4.8372098545349305, + "tokens_seen": 90767360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911534603811434, + "loss": 3.7727, + "theoretical_loss": 4.836719159094422, + "tokens_seen": 90832896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911434302908726, + "loss": 3.9573, + "theoretical_loss": 4.836228916612292, + "tokens_seen": 90898432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911334002006019, + "loss": 3.7081, + "theoretical_loss": 4.835739126344298, + "tokens_seen": 90963968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911233701103309, + "loss": 3.7443, + "theoretical_loss": 4.8352497875479505, + "tokens_seen": 91029504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911133400200602, + "loss": 3.6924, + "theoretical_loss": 4.834760899482514, + "tokens_seen": 91095040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911033099297893, + "loss": 3.5426, + "theoretical_loss": 4.834272461409001, + "tokens_seen": 91160576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910932798395186, + "loss": 3.956, + "theoretical_loss": 4.833784472590165, + "tokens_seen": 91226112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910832497492478, + "loss": 3.6949, + "theoretical_loss": 4.833296932290495, + "tokens_seen": 91291648 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491073219658977, + "loss": 3.7337, + "theoretical_loss": 4.832809839776213, + "tokens_seen": 91357184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910631895687061, + "loss": 3.8472, + "theoretical_loss": 4.832323194315265, + "tokens_seen": 91422720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910531594784353, + "loss": 3.8097, + "theoretical_loss": 4.831836995177319, + "tokens_seen": 91488256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910431293881645, + "loss": 3.6634, + "theoretical_loss": 4.831351241633756, + "tokens_seen": 91553792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910330992978937, + "loss": 3.8366, + "theoretical_loss": 4.8308659329576695, + "tokens_seen": 91619328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910230692076229, + "loss": 3.9118, + "theoretical_loss": 4.830381068423856, + "tokens_seen": 91684864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 177483, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7296459674835205, + "objective/train/theoretical_loss": 4.8298966473088125, + "objective/train/tokens_used": 112210400, + "theoretical_loss": 4.8298966473088125, + "tokens_seen": 91750400 + }, + { + "epoch": 0.03, + "learning_rate": 0.000491013039117352, + "loss": 3.8564, + "theoretical_loss": 4.8298966473088125, + "tokens_seen": 91750400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910030090270812, + "loss": 3.7379, + "theoretical_loss": 4.829412668890729, + "tokens_seen": 91815936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909929789368104, + "loss": 3.7511, + "theoretical_loss": 4.8289291324494865, + "tokens_seen": 91881472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909829488465397, + "loss": 3.766, + "theoretical_loss": 4.828446037266647, + "tokens_seen": 91947008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909729187562688, + "loss": 3.6744, + "theoretical_loss": 4.827963382625454, + "tokens_seen": 92012544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909628886659981, + "loss": 3.872, + "theoretical_loss": 4.827481167810825, + "tokens_seen": 92078080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909528585757272, + "loss": 3.7354, + "theoretical_loss": 4.826999392109344, + "tokens_seen": 92143616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909428284854564, + "loss": 3.8259, + "theoretical_loss": 4.826518054809259, + "tokens_seen": 92209152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909327983951856, + "loss": 3.8298, + "theoretical_loss": 4.826037155200478, + "tokens_seen": 92274688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909227683049148, + "loss": 3.7613, + "theoretical_loss": 4.825556692574562, + "tokens_seen": 92340224 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490912738214644, + "loss": 3.7828, + "theoretical_loss": 4.825076666224717, + "tokens_seen": 92405760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909027081243731, + "loss": 3.6679, + "theoretical_loss": 4.824597075445799, + "tokens_seen": 92471296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908926780341023, + "loss": 3.7271, + "theoretical_loss": 4.824117919534297, + "tokens_seen": 92536832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908826479438315, + "loss": 3.8421, + "theoretical_loss": 4.823639197788334, + "tokens_seen": 92602368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908726178535607, + "loss": 3.8383, + "theoretical_loss": 4.823160909507665, + "tokens_seen": 92667904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908625877632899, + "loss": 3.7335, + "theoretical_loss": 4.822683053993664, + "tokens_seen": 92733440 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490852557673019, + "loss": 3.8851, + "theoretical_loss": 4.822205630549329, + "tokens_seen": 92798976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908425275827482, + "loss": 3.671, + "theoretical_loss": 4.821728638479267, + "tokens_seen": 92864512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908324974924774, + "loss": 3.8826, + "theoretical_loss": 4.821252077089696, + "tokens_seen": 92930048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908224674022067, + "loss": 3.7597, + "theoretical_loss": 4.820775945688437, + "tokens_seen": 92995584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908124373119358, + "loss": 3.8076, + "theoretical_loss": 4.820300243584913, + "tokens_seen": 93061120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908024072216651, + "loss": 3.6756, + "theoretical_loss": 4.819824970090138, + "tokens_seen": 93126656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907923771313941, + "loss": 3.7671, + "theoretical_loss": 4.819350124516717, + "tokens_seen": 93192192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907823470411234, + "loss": 3.8059, + "theoretical_loss": 4.818875706178841, + "tokens_seen": 93257728 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907723169508526, + "loss": 3.9065, + "theoretical_loss": 4.818401714392279, + "tokens_seen": 93323264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 180773, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6933140754699707, + "objective/train/theoretical_loss": 4.817928148474378, + "objective/train/tokens_used": 113848800, + "theoretical_loss": 4.817928148474378, + "tokens_seen": 93388800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907622868605818, + "loss": 3.7297, + "theoretical_loss": 4.817928148474378, + "tokens_seen": 93388800 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490752256770311, + "loss": 3.7872, + "theoretical_loss": 4.817455007744052, + "tokens_seen": 93454336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907422266800401, + "loss": 3.7511, + "theoretical_loss": 4.816982291521785, + "tokens_seen": 93519872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907321965897693, + "loss": 3.6415, + "theoretical_loss": 4.816509999129618, + "tokens_seen": 93585408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907221664994985, + "loss": 3.7206, + "theoretical_loss": 4.816038129891151, + "tokens_seen": 93650944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907121364092277, + "loss": 3.717, + "theoretical_loss": 4.815566683131536, + "tokens_seen": 93716480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004907021063189569, + "loss": 3.6926, + "theoretical_loss": 4.815095658177472, + "tokens_seen": 93782016 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490692076228686, + "loss": 3.7171, + "theoretical_loss": 4.814625054357199, + "tokens_seen": 93847552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906820461384152, + "loss": 3.7838, + "theoretical_loss": 4.814154871000497, + "tokens_seen": 93913088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906720160481444, + "loss": 3.759, + "theoretical_loss": 4.813685107438679, + "tokens_seen": 93978624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906619859578736, + "loss": 3.6714, + "theoretical_loss": 4.813215763004585, + "tokens_seen": 94044160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906519558676028, + "loss": 3.5838, + "theoretical_loss": 4.812746837032582, + "tokens_seen": 94109696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906419257773321, + "loss": 3.8207, + "theoretical_loss": 4.812278328858554, + "tokens_seen": 94175232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906318956870611, + "loss": 3.8225, + "theoretical_loss": 4.811810237819904, + "tokens_seen": 94240768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906218655967904, + "loss": 3.9317, + "theoretical_loss": 4.81134256325554, + "tokens_seen": 94306304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906118355065195, + "loss": 3.759, + "theoretical_loss": 4.810875304505881, + "tokens_seen": 94371840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906018054162488, + "loss": 3.7607, + "theoretical_loss": 4.810408460912846, + "tokens_seen": 94437376 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490591775325978, + "loss": 3.7085, + "theoretical_loss": 4.809942031819853, + "tokens_seen": 94502912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905817452357072, + "loss": 3.6495, + "theoretical_loss": 4.809476016571809, + "tokens_seen": 94568448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905717151454363, + "loss": 3.9125, + "theoretical_loss": 4.809010414515113, + "tokens_seen": 94633984 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905616850551655, + "loss": 3.9224, + "theoretical_loss": 4.808545224997644, + "tokens_seen": 94699520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905516549648947, + "loss": 3.8819, + "theoretical_loss": 4.808080447368766, + "tokens_seen": 94765056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905416248746239, + "loss": 3.6431, + "theoretical_loss": 4.807616080979315, + "tokens_seen": 94830592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905315947843531, + "loss": 3.7594, + "theoretical_loss": 4.807152125181597, + "tokens_seen": 94896128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905215646940822, + "loss": 3.7619, + "theoretical_loss": 4.806688579329387, + "tokens_seen": 94961664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 182208, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.618969440460205, + "objective/train/theoretical_loss": 4.8062254427779205, + "objective/train/tokens_used": 115487200, + "theoretical_loss": 4.8062254427779205, + "tokens_seen": 95027200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905115346038114, + "loss": 3.7315, + "theoretical_loss": 4.8062254427779205, + "tokens_seen": 95027200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004905015045135406, + "loss": 3.7265, + "theoretical_loss": 4.80576271488389, + "tokens_seen": 95092736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904914744232698, + "loss": 3.7591, + "theoretical_loss": 4.805300395005444, + "tokens_seen": 95158272 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490481444332999, + "loss": 3.769, + "theoretical_loss": 4.804838482502181, + "tokens_seen": 95223808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904714142427281, + "loss": 3.7477, + "theoretical_loss": 4.8043769767351385, + "tokens_seen": 95289344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904613841524574, + "loss": 3.8054, + "theoretical_loss": 4.8039158770668005, + "tokens_seen": 95354880 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904513540621865, + "loss": 3.7387, + "theoretical_loss": 4.803455182861087, + "tokens_seen": 95420416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904413239719158, + "loss": 3.9279, + "theoretical_loss": 4.802994893483348, + "tokens_seen": 95485952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904312938816449, + "loss": 3.7263, + "theoretical_loss": 4.802535008300364, + "tokens_seen": 95551488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904212637913742, + "loss": 3.6386, + "theoretical_loss": 4.802075526680335, + "tokens_seen": 95617024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904112337011033, + "loss": 3.7494, + "theoretical_loss": 4.801616447992888, + "tokens_seen": 95682560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904012036108325, + "loss": 3.8661, + "theoretical_loss": 4.801157771609061, + "tokens_seen": 95748096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903911735205617, + "loss": 3.7003, + "theoretical_loss": 4.8006994969013, + "tokens_seen": 95813632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903811434302909, + "loss": 3.7686, + "theoretical_loss": 4.800241623243467, + "tokens_seen": 95879168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903711133400201, + "loss": 3.8712, + "theoretical_loss": 4.799784150010819, + "tokens_seen": 95944704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903610832497492, + "loss": 3.8371, + "theoretical_loss": 4.799327076580017, + "tokens_seen": 96010240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903510531594784, + "loss": 4.0236, + "theoretical_loss": 4.798870402329115, + "tokens_seen": 96075776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903410230692076, + "loss": 3.6967, + "theoretical_loss": 4.798414126637558, + "tokens_seen": 96141312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903309929789368, + "loss": 3.8331, + "theoretical_loss": 4.797958248886179, + "tokens_seen": 96206848 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490320962888666, + "loss": 3.804, + "theoretical_loss": 4.797502768457193, + "tokens_seen": 96272384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903109327983952, + "loss": 3.8146, + "theoretical_loss": 4.797047684734192, + "tokens_seen": 96337920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903009027081243, + "loss": 3.7156, + "theoretical_loss": 4.796592997102147, + "tokens_seen": 96403456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902908726178535, + "loss": 3.6709, + "theoretical_loss": 4.796138704947397, + "tokens_seen": 96468992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902808425275828, + "loss": 3.7076, + "theoretical_loss": 4.795684807657649, + "tokens_seen": 96534528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902708124373119, + "loss": 3.6201, + "theoretical_loss": 4.795231304621968, + "tokens_seen": 96600064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 184806, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7147223949432373, + "objective/train/theoretical_loss": 4.794778195230787, + "objective/train/tokens_used": 117125600, + "theoretical_loss": 4.794778195230787, + "tokens_seen": 96665600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902607823470412, + "loss": 3.764, + "theoretical_loss": 4.794778195230787, + "tokens_seen": 96665600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902507522567703, + "loss": 3.7941, + "theoretical_loss": 4.794325478875885, + "tokens_seen": 96731136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902407221664995, + "loss": 3.6837, + "theoretical_loss": 4.793873154950399, + "tokens_seen": 96796672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902306920762287, + "loss": 3.6034, + "theoretical_loss": 4.793421222848808, + "tokens_seen": 96862208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902206619859579, + "loss": 3.665, + "theoretical_loss": 4.7929696819669365, + "tokens_seen": 96927744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902106318956871, + "loss": 3.7708, + "theoretical_loss": 4.792518531701948, + "tokens_seen": 96993280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902006018054163, + "loss": 3.6515, + "theoretical_loss": 4.792067771452341, + "tokens_seen": 97058816 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901905717151454, + "loss": 3.6389, + "theoretical_loss": 4.791617400617948, + "tokens_seen": 97124352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901805416248746, + "loss": 3.8202, + "theoretical_loss": 4.791167418599925, + "tokens_seen": 97189888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901705115346038, + "loss": 3.627, + "theoretical_loss": 4.790717824800755, + "tokens_seen": 97255424 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490160481444333, + "loss": 3.7503, + "theoretical_loss": 4.790268618624239, + "tokens_seen": 97320960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901504513540623, + "loss": 3.8389, + "theoretical_loss": 4.789819799475499, + "tokens_seen": 97386496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901404212637913, + "loss": 3.7473, + "theoretical_loss": 4.789371366760961, + "tokens_seen": 97452032 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901303911735206, + "loss": 3.7775, + "theoretical_loss": 4.788923319888369, + "tokens_seen": 97517568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901203610832497, + "loss": 3.6673, + "theoretical_loss": 4.788475658266766, + "tokens_seen": 97583104 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490110330992979, + "loss": 3.9773, + "theoretical_loss": 4.788028381306497, + "tokens_seen": 97648640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901003009027082, + "loss": 3.8573, + "theoretical_loss": 4.787581488419207, + "tokens_seen": 97714176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900902708124374, + "loss": 3.8131, + "theoretical_loss": 4.787134979017832, + "tokens_seen": 97779712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900802407221665, + "loss": 3.7261, + "theoretical_loss": 4.786688852516599, + "tokens_seen": 97845248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900702106318957, + "loss": 3.6487, + "theoretical_loss": 4.786243108331024, + "tokens_seen": 97910784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900601805416249, + "loss": 3.7621, + "theoretical_loss": 4.7857977458779, + "tokens_seen": 97976320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900501504513541, + "loss": 3.9348, + "theoretical_loss": 4.785352764575304, + "tokens_seen": 98041856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900401203610833, + "loss": 3.6737, + "theoretical_loss": 4.784908163842585, + "tokens_seen": 98107392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900300902708124, + "loss": 3.6835, + "theoretical_loss": 4.784463943100367, + "tokens_seen": 98172928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900200601805416, + "loss": 3.8009, + "theoretical_loss": 4.7840201017705395, + "tokens_seen": 98238464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 187560, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1904149055480957, + "objective/train/theoretical_loss": 4.783576639276257, + "objective/train/tokens_used": 118764000, + "theoretical_loss": 4.783576639276257, + "tokens_seen": 98304000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900100300902708, + "loss": 3.6708, + "theoretical_loss": 4.783576639276257, + "tokens_seen": 98304000 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049, + "loss": 3.7385, + "theoretical_loss": 4.783133555041934, + "tokens_seen": 98369536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899899699097292, + "loss": 3.7634, + "theoretical_loss": 4.782690848493245, + "tokens_seen": 98435072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899799398194583, + "loss": 3.7789, + "theoretical_loss": 4.7822485190571165, + "tokens_seen": 98500608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899699097291876, + "loss": 3.6162, + "theoretical_loss": 4.781806566161723, + "tokens_seen": 98566144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899598796389167, + "loss": 3.6022, + "theoretical_loss": 4.781364989236488, + "tokens_seen": 98631680 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489949849548646, + "loss": 3.874, + "theoretical_loss": 4.78092378771208, + "tokens_seen": 98697216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899398194583751, + "loss": 3.7346, + "theoretical_loss": 4.780482961020402, + "tokens_seen": 98762752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899297893681044, + "loss": 3.9051, + "theoretical_loss": 4.780042508594596, + "tokens_seen": 98828288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899197592778335, + "loss": 3.7684, + "theoretical_loss": 4.779602429869035, + "tokens_seen": 98893824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899097291875627, + "loss": 3.6814, + "theoretical_loss": 4.779162724279324, + "tokens_seen": 98959360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898996990972919, + "loss": 3.7042, + "theoretical_loss": 4.7787233912622895, + "tokens_seen": 99024896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898896690070211, + "loss": 3.7043, + "theoretical_loss": 4.778284430255981, + "tokens_seen": 99090432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898796389167503, + "loss": 3.8086, + "theoretical_loss": 4.77784584069967, + "tokens_seen": 99155968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898696088264794, + "loss": 3.6726, + "theoretical_loss": 4.777407622033838, + "tokens_seen": 99221504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898595787362086, + "loss": 3.8388, + "theoretical_loss": 4.776969773700181, + "tokens_seen": 99287040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898495486459378, + "loss": 3.8056, + "theoretical_loss": 4.776532295141601, + "tokens_seen": 99352576 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489839518555667, + "loss": 3.6963, + "theoretical_loss": 4.776095185802211, + "tokens_seen": 99418112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898294884653962, + "loss": 3.608, + "theoretical_loss": 4.775658445127318, + "tokens_seen": 99483648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898194583751254, + "loss": 3.6239, + "theoretical_loss": 4.775222072563429, + "tokens_seen": 99549184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898094282848545, + "loss": 3.6802, + "theoretical_loss": 4.7747860675582485, + "tokens_seen": 99614720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897993981945837, + "loss": 3.5483, + "theoretical_loss": 4.77435042956067, + "tokens_seen": 99680256 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489789368104313, + "loss": 3.7372, + "theoretical_loss": 4.773915158020776, + "tokens_seen": 99745792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897793380140421, + "loss": 3.7197, + "theoretical_loss": 4.773480252389831, + "tokens_seen": 99811328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897693079237714, + "loss": 3.7268, + "theoretical_loss": 4.773045712120284, + "tokens_seen": 99876864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 190327, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.753544330596924, + "objective/train/theoretical_loss": 4.77261153666576, + "objective/train/tokens_used": 120402400, + "theoretical_loss": 4.77261153666576, + "tokens_seen": 99942400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897592778335005, + "loss": 3.7631, + "theoretical_loss": 4.77261153666576, + "tokens_seen": 99942400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897492477432297, + "loss": 3.5417, + "theoretical_loss": 4.772177725481062, + "tokens_seen": 100007936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897392176529589, + "loss": 3.9256, + "theoretical_loss": 4.77174427802216, + "tokens_seen": 100073472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897291875626881, + "loss": 3.7051, + "theoretical_loss": 4.771311193746191, + "tokens_seen": 100139008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897191574724173, + "loss": 3.7199, + "theoretical_loss": 4.770878472111465, + "tokens_seen": 100204544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897091273821465, + "loss": 3.7442, + "theoretical_loss": 4.770446112577445, + "tokens_seen": 100270080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896990972918756, + "loss": 3.9428, + "theoretical_loss": 4.770014114604756, + "tokens_seen": 100335616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896890672016048, + "loss": 3.7069, + "theoretical_loss": 4.769582477655177, + "tokens_seen": 100401152 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489679037111334, + "loss": 3.751, + "theoretical_loss": 4.769151201191641, + "tokens_seen": 100466688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896690070210632, + "loss": 3.6344, + "theoretical_loss": 4.768720284678228, + "tokens_seen": 100532224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896589769307924, + "loss": 3.6241, + "theoretical_loss": 4.768289727580161, + "tokens_seen": 100597760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896489468405215, + "loss": 3.8281, + "theoretical_loss": 4.767859529363809, + "tokens_seen": 100663296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896389167502507, + "loss": 3.8264, + "theoretical_loss": 4.767429689496682, + "tokens_seen": 100728832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896288866599799, + "loss": 3.4205, + "theoretical_loss": 4.767000207447417, + "tokens_seen": 100794368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896188565697091, + "loss": 3.7594, + "theoretical_loss": 4.766571082685794, + "tokens_seen": 100859904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896088264794384, + "loss": 3.4893, + "theoretical_loss": 4.766142314682716, + "tokens_seen": 100925440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895987963891674, + "loss": 3.841, + "theoretical_loss": 4.765713902910214, + "tokens_seen": 100990976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895887662988967, + "loss": 3.9616, + "theoretical_loss": 4.765285846841444, + "tokens_seen": 101056512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895787362086259, + "loss": 3.7223, + "theoretical_loss": 4.76485814595068, + "tokens_seen": 101122048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895687061183551, + "loss": 3.7445, + "theoretical_loss": 4.764430799713314, + "tokens_seen": 101187584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895586760280843, + "loss": 3.7522, + "theoretical_loss": 4.764003807605853, + "tokens_seen": 101253120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895486459378135, + "loss": 3.8225, + "theoretical_loss": 4.763577169105912, + "tokens_seen": 101318656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895386158475426, + "loss": 3.6539, + "theoretical_loss": 4.763150883692218, + "tokens_seen": 101384192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895285857572718, + "loss": 3.7513, + "theoretical_loss": 4.762724950844598, + "tokens_seen": 101449728 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489518555667001, + "loss": 3.8192, + "theoretical_loss": 4.762299370043984, + "tokens_seen": 101515264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 193223, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.837930202484131, + "objective/train/theoretical_loss": 4.761874140772408, + "objective/train/tokens_used": 122040800, + "theoretical_loss": 4.761874140772408, + "tokens_seen": 101580800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895085255767302, + "loss": 3.7441, + "theoretical_loss": 4.761874140772408, + "tokens_seen": 101580800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894984954864594, + "loss": 3.5863, + "theoretical_loss": 4.761449262512993, + "tokens_seen": 101646336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894884653961885, + "loss": 3.6207, + "theoretical_loss": 4.761024734749958, + "tokens_seen": 101711872 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894784353059178, + "loss": 3.7106, + "theoretical_loss": 4.76060055696861, + "tokens_seen": 101777408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894684052156469, + "loss": 3.6709, + "theoretical_loss": 4.760176728655345, + "tokens_seen": 101842944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894583751253762, + "loss": 3.5943, + "theoretical_loss": 4.75975324929764, + "tokens_seen": 101908480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894483450351053, + "loss": 3.5745, + "theoretical_loss": 4.759330118384053, + "tokens_seen": 101974016 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894383149448346, + "loss": 3.8006, + "theoretical_loss": 4.758907335404221, + "tokens_seen": 102039552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894282848545637, + "loss": 3.6088, + "theoretical_loss": 4.758484899848854, + "tokens_seen": 102105088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894182547642929, + "loss": 3.7947, + "theoretical_loss": 4.7580628112097365, + "tokens_seen": 102170624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894082246740221, + "loss": 3.7144, + "theoretical_loss": 4.7576410689797175, + "tokens_seen": 102236160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893981945837513, + "loss": 3.6313, + "theoretical_loss": 4.757219672652717, + "tokens_seen": 102301696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893881644934805, + "loss": 3.6257, + "theoretical_loss": 4.756798621723712, + "tokens_seen": 102367232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893781344032096, + "loss": 3.7725, + "theoretical_loss": 4.756377915688748, + "tokens_seen": 102432768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893681043129388, + "loss": 3.7932, + "theoretical_loss": 4.755957554044917, + "tokens_seen": 102498304 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489358074222668, + "loss": 3.7783, + "theoretical_loss": 4.755537536290373, + "tokens_seen": 102563840 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893480441323972, + "loss": 3.5894, + "theoretical_loss": 4.755117861924321, + "tokens_seen": 102629376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893380140421264, + "loss": 3.8479, + "theoretical_loss": 4.754698530447009, + "tokens_seen": 102694912 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893279839518556, + "loss": 3.6715, + "theoretical_loss": 4.754279541359738, + "tokens_seen": 102760448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893179538615847, + "loss": 3.8795, + "theoretical_loss": 4.753860894164845, + "tokens_seen": 102825984 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893079237713139, + "loss": 3.4681, + "theoretical_loss": 4.75344258836571, + "tokens_seen": 102891520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892978936810432, + "loss": 3.66, + "theoretical_loss": 4.753024623466752, + "tokens_seen": 102957056 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892878635907723, + "loss": 3.7237, + "theoretical_loss": 4.752606998973421, + "tokens_seen": 103022592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892778335005016, + "loss": 3.7994, + "theoretical_loss": 4.752189714392202, + "tokens_seen": 103088128 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892678034102307, + "loss": 3.6094, + "theoretical_loss": 4.7517727692306035, + "tokens_seen": 103153664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 194897, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6933863162994385, + "objective/train/theoretical_loss": 4.751356162997164, + "objective/train/tokens_used": 123679200, + "theoretical_loss": 4.751356162997164, + "tokens_seen": 103219200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892577733199599, + "loss": 3.6346, + "theoretical_loss": 4.751356162997164, + "tokens_seen": 103219200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892477432296891, + "loss": 3.7225, + "theoretical_loss": 4.750939895201443, + "tokens_seen": 103284736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892377131394183, + "loss": 3.8266, + "theoretical_loss": 4.750523965354024, + "tokens_seen": 103350272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892276830491475, + "loss": 3.5487, + "theoretical_loss": 4.750108372966501, + "tokens_seen": 103415808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892176529588767, + "loss": 3.8268, + "theoretical_loss": 4.749693117551491, + "tokens_seen": 103481344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892076228686058, + "loss": 3.6588, + "theoretical_loss": 4.749278198622617, + "tokens_seen": 103546880 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489197592778335, + "loss": 3.6327, + "theoretical_loss": 4.748863615694514, + "tokens_seen": 103612416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891875626880642, + "loss": 3.6437, + "theoretical_loss": 4.748449368282822, + "tokens_seen": 103677952 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891775325977934, + "loss": 3.7011, + "theoretical_loss": 4.748035455904185, + "tokens_seen": 103743488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891675025075226, + "loss": 3.6569, + "theoretical_loss": 4.747621878076252, + "tokens_seen": 103809024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891574724172517, + "loss": 3.717, + "theoretical_loss": 4.747208634317664, + "tokens_seen": 103874560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891474423269809, + "loss": 3.686, + "theoretical_loss": 4.746795724148061, + "tokens_seen": 103940096 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891374122367101, + "loss": 3.6645, + "theoretical_loss": 4.746383147088078, + "tokens_seen": 104005632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891273821464393, + "loss": 3.7336, + "theoretical_loss": 4.745970902659338, + "tokens_seen": 104071168 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891173520561686, + "loss": 3.7659, + "theoretical_loss": 4.745558990384451, + "tokens_seen": 104136704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004891073219658976, + "loss": 3.6579, + "theoretical_loss": 4.7451474097870125, + "tokens_seen": 104202240 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890972918756269, + "loss": 3.7532, + "theoretical_loss": 4.744736160391602, + "tokens_seen": 104267776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890872617853561, + "loss": 3.5556, + "theoretical_loss": 4.744325241723777, + "tokens_seen": 104333312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890772316950853, + "loss": 3.797, + "theoretical_loss": 4.743914653310073, + "tokens_seen": 104398848 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890672016048145, + "loss": 3.7419, + "theoretical_loss": 4.743504394678, + "tokens_seen": 104464384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890571715145437, + "loss": 3.6012, + "theoretical_loss": 4.743094465356039, + "tokens_seen": 104529920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890471414242728, + "loss": 3.7022, + "theoretical_loss": 4.742684864873641, + "tokens_seen": 104595456 + }, + { + "epoch": 0.03, + "learning_rate": 0.000489037111334002, + "loss": 3.6307, + "theoretical_loss": 4.742275592761223, + "tokens_seen": 104660992 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890270812437312, + "loss": 3.5759, + "theoretical_loss": 4.741866648550168, + "tokens_seen": 104726528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890170511534604, + "loss": 3.8154, + "theoretical_loss": 4.741458031772817, + "tokens_seen": 104792064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 197518, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4146454334259033, + "objective/train/theoretical_loss": 4.741049741962473, + "objective/train/tokens_used": 125317600, + "theoretical_loss": 4.741049741962473, + "tokens_seen": 104857600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890070210631896, + "loss": 3.5851, + "theoretical_loss": 4.741049741962473, + "tokens_seen": 104857600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889969909729187, + "loss": 3.6059, + "theoretical_loss": 4.740641778653395, + "tokens_seen": 104923136 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889869608826479, + "loss": 3.7144, + "theoretical_loss": 4.740234141380794, + "tokens_seen": 104988672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889769307923771, + "loss": 3.7424, + "theoretical_loss": 4.739826829680833, + "tokens_seen": 105054208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889669007021063, + "loss": 3.622, + "theoretical_loss": 4.739419843090626, + "tokens_seen": 105119744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889568706118355, + "loss": 3.6745, + "theoretical_loss": 4.739013181148229, + "tokens_seen": 105185280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889468405215647, + "loss": 3.6853, + "theoretical_loss": 4.738606843392644, + "tokens_seen": 105250816 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889368104312939, + "loss": 3.7924, + "theoretical_loss": 4.738200829363815, + "tokens_seen": 105316352 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488926780341023, + "loss": 3.7697, + "theoretical_loss": 4.737795138602624, + "tokens_seen": 105381888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889167502507523, + "loss": 3.7094, + "theoretical_loss": 4.737389770650887, + "tokens_seen": 105447424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889067201604815, + "loss": 3.5931, + "theoretical_loss": 4.736984725051357, + "tokens_seen": 105512960 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888966900702107, + "loss": 3.6468, + "theoretical_loss": 4.736580001347717, + "tokens_seen": 105578496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888866599799398, + "loss": 3.7407, + "theoretical_loss": 4.736175599084576, + "tokens_seen": 105644032 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488876629889669, + "loss": 3.6928, + "theoretical_loss": 4.735771517807473, + "tokens_seen": 105709568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888665997993982, + "loss": 3.6796, + "theoretical_loss": 4.735367757062869, + "tokens_seen": 105775104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888565697091274, + "loss": 3.829, + "theoretical_loss": 4.734964316398148, + "tokens_seen": 105840640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888465396188566, + "loss": 3.7105, + "theoretical_loss": 4.734561195361609, + "tokens_seen": 105906176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888365095285858, + "loss": 3.3967, + "theoretical_loss": 4.734158393502471, + "tokens_seen": 105971712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888264794383149, + "loss": 3.7597, + "theoretical_loss": 4.733755910370867, + "tokens_seen": 106037248 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888164493480441, + "loss": 3.839, + "theoretical_loss": 4.73335374551784, + "tokens_seen": 106102784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004888064192577733, + "loss": 3.8596, + "theoretical_loss": 4.732951898495341, + "tokens_seen": 106168320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887963891675025, + "loss": 3.6095, + "theoretical_loss": 4.7325503688562325, + "tokens_seen": 106233856 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887863590772317, + "loss": 3.7037, + "theoretical_loss": 4.732149156154276, + "tokens_seen": 106299392 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488776328986961, + "loss": 3.7386, + "theoretical_loss": 4.731748259944139, + "tokens_seen": 106364928 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048876629889669, + "loss": 3.6249, + "theoretical_loss": 4.731347679781386, + "tokens_seen": 106430464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 200313, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4766695499420166, + "objective/train/theoretical_loss": 4.730947415222481, + "objective/train/tokens_used": 126956000, + "theoretical_loss": 4.730947415222481, + "tokens_seen": 106496000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887562688064193, + "loss": 3.586, + "theoretical_loss": 4.730947415222481, + "tokens_seen": 106496000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887462387161484, + "loss": 3.7208, + "theoretical_loss": 4.730547465824781, + "tokens_seen": 106561536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887362086258777, + "loss": 3.681, + "theoretical_loss": 4.730147831146537, + "tokens_seen": 106627072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887261785356069, + "loss": 3.7854, + "theoretical_loss": 4.72974851074689, + "tokens_seen": 106692608 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488716148445336, + "loss": 3.5136, + "theoretical_loss": 4.729349504185867, + "tokens_seen": 106758144 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887061183550652, + "loss": 3.5889, + "theoretical_loss": 4.728950811024383, + "tokens_seen": 106823680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886960882647944, + "loss": 3.7427, + "theoretical_loss": 4.7285524308242355, + "tokens_seen": 106889216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886860581745236, + "loss": 3.7865, + "theoretical_loss": 4.728154363148102, + "tokens_seen": 106954752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886760280842528, + "loss": 3.7202, + "theoretical_loss": 4.72775660755954, + "tokens_seen": 107020288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886659979939819, + "loss": 3.8236, + "theoretical_loss": 4.72735916362298, + "tokens_seen": 107085824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886559679037111, + "loss": 3.8278, + "theoretical_loss": 4.7269620309037315, + "tokens_seen": 107151360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886459378134403, + "loss": 3.5964, + "theoretical_loss": 4.726565208967973, + "tokens_seen": 107216896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886359077231695, + "loss": 3.726, + "theoretical_loss": 4.726168697382751, + "tokens_seen": 107282432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886258776328988, + "loss": 3.7337, + "theoretical_loss": 4.725772495715983, + "tokens_seen": 107347968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886158475426278, + "loss": 3.6917, + "theoretical_loss": 4.725376603536446, + "tokens_seen": 107413504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886058174523571, + "loss": 3.836, + "theoretical_loss": 4.724981020413787, + "tokens_seen": 107479040 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885957873620863, + "loss": 3.5967, + "theoretical_loss": 4.724585745918505, + "tokens_seen": 107544576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885857572718155, + "loss": 3.6473, + "theoretical_loss": 4.7241907796219635, + "tokens_seen": 107610112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885757271815447, + "loss": 3.7382, + "theoretical_loss": 4.723796121096381, + "tokens_seen": 107675648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885656970912739, + "loss": 3.773, + "theoretical_loss": 4.723401769914824, + "tokens_seen": 107741184 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488555667001003, + "loss": 3.7638, + "theoretical_loss": 4.723007725651219, + "tokens_seen": 107806720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885456369107322, + "loss": 3.7642, + "theoretical_loss": 4.722613987880335, + "tokens_seen": 107872256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885356068204614, + "loss": 3.5426, + "theoretical_loss": 4.722220556177792, + "tokens_seen": 107937792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885255767301906, + "loss": 3.6219, + "theoretical_loss": 4.721827430120053, + "tokens_seen": 108003328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885155466399198, + "loss": 3.3974, + "theoretical_loss": 4.721434609284424, + "tokens_seen": 108068864 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 203124, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.545315742492676, + "objective/train/theoretical_loss": 4.721042093249051, + "objective/train/tokens_used": 128594400, + "theoretical_loss": 4.721042093249051, + "tokens_seen": 108134400 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488505516549649, + "loss": 3.6144, + "theoretical_loss": 4.721042093249051, + "tokens_seen": 108134400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884954864593781, + "loss": 3.6136, + "theoretical_loss": 4.720649881592919, + "tokens_seen": 108199936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884854563691073, + "loss": 3.6887, + "theoretical_loss": 4.7202579738958494, + "tokens_seen": 108265472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884754262788365, + "loss": 3.5893, + "theoretical_loss": 4.7198663697384955, + "tokens_seen": 108331008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884653961885657, + "loss": 3.6607, + "theoretical_loss": 4.719475068702346, + "tokens_seen": 108396544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884553660982949, + "loss": 3.49, + "theoretical_loss": 4.719084070369714, + "tokens_seen": 108462080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884453360080241, + "loss": 3.5747, + "theoretical_loss": 4.718693374323747, + "tokens_seen": 108527616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884353059177532, + "loss": 3.5555, + "theoretical_loss": 4.718302980148412, + "tokens_seen": 108593152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884252758274825, + "loss": 3.8743, + "theoretical_loss": 4.717912887428501, + "tokens_seen": 108658688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884152457372117, + "loss": 3.5749, + "theoretical_loss": 4.717523095749626, + "tokens_seen": 108724224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884052156469409, + "loss": 3.6322, + "theoretical_loss": 4.717133604698222, + "tokens_seen": 108789760 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048839518555667, + "loss": 3.6726, + "theoretical_loss": 4.7167444138615355, + "tokens_seen": 108855296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883851554663992, + "loss": 3.5148, + "theoretical_loss": 4.716355522827633, + "tokens_seen": 108920832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883751253761284, + "loss": 3.7259, + "theoretical_loss": 4.715966931185388, + "tokens_seen": 108986368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883650952858576, + "loss": 3.5415, + "theoretical_loss": 4.715578638524491, + "tokens_seen": 109051904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883550651955868, + "loss": 3.6574, + "theoretical_loss": 4.715190644435435, + "tokens_seen": 109117440 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488345035105316, + "loss": 3.7252, + "theoretical_loss": 4.714802948509522, + "tokens_seen": 109182976 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883350050150451, + "loss": 3.5987, + "theoretical_loss": 4.71441555033886, + "tokens_seen": 109248512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883249749247743, + "loss": 3.6932, + "theoretical_loss": 4.714028449516356, + "tokens_seen": 109314048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883149448345035, + "loss": 3.5379, + "theoretical_loss": 4.713641645635718, + "tokens_seen": 109379584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883049147442327, + "loss": 3.681, + "theoretical_loss": 4.713255138291454, + "tokens_seen": 109445120 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882948846539619, + "loss": 3.6972, + "theoretical_loss": 4.712868927078868, + "tokens_seen": 109510656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882848545636911, + "loss": 3.8093, + "theoretical_loss": 4.712483011594056, + "tokens_seen": 109576192 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882748244734203, + "loss": 3.737, + "theoretical_loss": 4.7120973914339075, + "tokens_seen": 109641728 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048826479438314946, + "loss": 3.2673, + "theoretical_loss": 4.7117120661961005, + "tokens_seen": 109707264 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 205882, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.411952495574951, + "objective/train/theoretical_loss": 4.711327035479103, + "objective/train/tokens_used": 130232800, + "theoretical_loss": 4.711327035479103, + "tokens_seen": 109772800 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048825476429287864, + "loss": 3.7665, + "theoretical_loss": 4.711327035479103, + "tokens_seen": 109772800 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882447342026078, + "loss": 3.7022, + "theoretical_loss": 4.710942298882169, + "tokens_seen": 109838336 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488234704112337, + "loss": 3.5817, + "theoretical_loss": 4.710557856005335, + "tokens_seen": 109903872 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048822467402206624, + "loss": 3.4355, + "theoretical_loss": 4.710173706449419, + "tokens_seen": 109969408 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048821464393179536, + "loss": 3.5931, + "theoretical_loss": 4.709789849816021, + "tokens_seen": 110034944 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882046138415246, + "loss": 3.6691, + "theoretical_loss": 4.7094062857075185, + "tokens_seen": 110100480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881945837512537, + "loss": 3.6156, + "theoretical_loss": 4.709023013727063, + "tokens_seen": 110166016 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048818455366098296, + "loss": 3.7604, + "theoretical_loss": 4.708640033478584, + "tokens_seen": 110231552 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048817452357071214, + "loss": 3.4604, + "theoretical_loss": 4.708257344566778, + "tokens_seen": 110297088 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881644934804413, + "loss": 3.5452, + "theoretical_loss": 4.7078749465971175, + "tokens_seen": 110362624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881544633901705, + "loss": 3.6627, + "theoretical_loss": 4.707492839175837, + "tokens_seen": 110428160 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048814443329989974, + "loss": 3.5653, + "theoretical_loss": 4.707111021909941, + "tokens_seen": 110493696 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048813440320962887, + "loss": 3.5681, + "theoretical_loss": 4.706729494407197, + "tokens_seen": 110559232 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004881243731193581, + "loss": 3.726, + "theoretical_loss": 4.706348256276138, + "tokens_seen": 110624768 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048811434302908723, + "loss": 3.7242, + "theoretical_loss": 4.705967307126051, + "tokens_seen": 110690304 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048810431293881646, + "loss": 3.5524, + "theoretical_loss": 4.705586646566987, + "tokens_seen": 110755840 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048809428284854564, + "loss": 3.5449, + "theoretical_loss": 4.705206274209751, + "tokens_seen": 110821376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880842527582748, + "loss": 3.7149, + "theoretical_loss": 4.704826189665905, + "tokens_seen": 110886912 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488074222668004, + "loss": 3.7531, + "theoretical_loss": 4.704446392547759, + "tokens_seen": 110952448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880641925777332, + "loss": 3.6661, + "theoretical_loss": 4.7040668824683785, + "tokens_seen": 111017984 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048805416248746237, + "loss": 3.7145, + "theoretical_loss": 4.7036876590415755, + "tokens_seen": 111083520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880441323971916, + "loss": 3.6576, + "theoretical_loss": 4.7033087218819105, + "tokens_seen": 111149056 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048803410230692073, + "loss": 3.6797, + "theoretical_loss": 4.7029300706046895, + "tokens_seen": 111214592 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048802407221664997, + "loss": 3.7227, + "theoretical_loss": 4.702551704825957, + "tokens_seen": 111280128 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048801404212637915, + "loss": 3.6336, + "theoretical_loss": 4.702173624162507, + "tokens_seen": 111345664 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 208312, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.573821544647217, + "objective/train/theoretical_loss": 4.701795828231866, + "objective/train/tokens_used": 131871200, + "theoretical_loss": 4.701795828231866, + "tokens_seen": 111411200 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048800401203610833, + "loss": 3.6023, + "theoretical_loss": 4.701795828231866, + "tokens_seen": 111411200 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879939819458375, + "loss": 3.5403, + "theoretical_loss": 4.701418316652299, + "tokens_seen": 111476736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879839518555667, + "loss": 3.6287, + "theoretical_loss": 4.701041089042813, + "tokens_seen": 111542272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879739217652959, + "loss": 3.5451, + "theoretical_loss": 4.700664145023142, + "tokens_seen": 111607808 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879638916750251, + "loss": 3.6301, + "theoretical_loss": 4.700287484213753, + "tokens_seen": 111673344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879538615847543, + "loss": 3.8025, + "theoretical_loss": 4.699911106235849, + "tokens_seen": 111738880 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048794383149448347, + "loss": 3.6605, + "theoretical_loss": 4.6995350107113545, + "tokens_seen": 111804416 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048793380140421265, + "loss": 3.8372, + "theoretical_loss": 4.699159197262922, + "tokens_seen": 111869952 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048792377131394183, + "loss": 3.4682, + "theoretical_loss": 4.698783665513934, + "tokens_seen": 111935488 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048791374122367107, + "loss": 3.5544, + "theoretical_loss": 4.698408415088491, + "tokens_seen": 112001024 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879037111334002, + "loss": 3.5318, + "theoretical_loss": 4.698033445611415, + "tokens_seen": 112066560 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048789368104312943, + "loss": 3.5034, + "theoretical_loss": 4.6976587567082495, + "tokens_seen": 112132096 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048788365095285856, + "loss": 3.6678, + "theoretical_loss": 4.697284348005253, + "tokens_seen": 112197632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004878736208625878, + "loss": 3.5465, + "theoretical_loss": 4.696910219129402, + "tokens_seen": 112263168 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048786359077231697, + "loss": 3.6581, + "theoretical_loss": 4.696536369708386, + "tokens_seen": 112328704 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048785356068204615, + "loss": 3.688, + "theoretical_loss": 4.696162799370606, + "tokens_seen": 112394240 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048784353059177533, + "loss": 3.6517, + "theoretical_loss": 4.695789507745176, + "tokens_seen": 112459776 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048783350050150457, + "loss": 3.737, + "theoretical_loss": 4.695416494461917, + "tokens_seen": 112525312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004878234704112337, + "loss": 3.7629, + "theoretical_loss": 4.695043759151353, + "tokens_seen": 112590848 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048781344032096293, + "loss": 3.6696, + "theoretical_loss": 4.694671301444722, + "tokens_seen": 112656384 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048780341023069206, + "loss": 3.571, + "theoretical_loss": 4.694299120973957, + "tokens_seen": 112721920 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877933801404213, + "loss": 3.6666, + "theoretical_loss": 4.693927217371698, + "tokens_seen": 112787456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877833500501505, + "loss": 3.7417, + "theoretical_loss": 4.693555590271282, + "tokens_seen": 112852992 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048777331995987966, + "loss": 3.6998, + "theoretical_loss": 4.693184239306744, + "tokens_seen": 112918528 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048776328986960884, + "loss": 3.5575, + "theoretical_loss": 4.692813164112819, + "tokens_seen": 112984064 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 209733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.70513916015625, + "objective/train/theoretical_loss": 4.692442364324931, + "objective/train/tokens_used": 133509600, + "theoretical_loss": 4.692442364324931, + "tokens_seen": 113049600 + }, + { + "epoch": 0.03, + "learning_rate": 0.000487753259779338, + "loss": 3.661, + "theoretical_loss": 4.692442364324931, + "tokens_seen": 113049600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877432296890672, + "loss": 3.6031, + "theoretical_loss": 4.692071839579201, + "tokens_seen": 113115136 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048773319959879644, + "loss": 3.5727, + "theoretical_loss": 4.6917015895124425, + "tokens_seen": 113180672 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048772316950852556, + "loss": 3.5795, + "theoretical_loss": 4.691331613762153, + "tokens_seen": 113246208 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877131394182548, + "loss": 3.5552, + "theoretical_loss": 4.690961911966523, + "tokens_seen": 113311744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877031093279839, + "loss": 3.6073, + "theoretical_loss": 4.690592483764427, + "tokens_seen": 113377280 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048769307923771316, + "loss": 3.6342, + "theoretical_loss": 4.690223328795424, + "tokens_seen": 113442816 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048768304914744234, + "loss": 3.6074, + "theoretical_loss": 4.689854446699757, + "tokens_seen": 113508352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876730190571715, + "loss": 3.5164, + "theoretical_loss": 4.689485837118347, + "tokens_seen": 113573888 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876629889669007, + "loss": 3.6643, + "theoretical_loss": 4.689117499692798, + "tokens_seen": 113639424 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048765295887662994, + "loss": 3.706, + "theoretical_loss": 4.688749434065389, + "tokens_seen": 113704960 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048764292878635907, + "loss": 3.6544, + "theoretical_loss": 4.688381639879076, + "tokens_seen": 113770496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004876328986960883, + "loss": 3.7154, + "theoretical_loss": 4.68801411677749, + "tokens_seen": 113836032 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048762286860581743, + "loss": 3.6392, + "theoretical_loss": 4.687646864404934, + "tokens_seen": 113901568 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048761283851554666, + "loss": 3.4994, + "theoretical_loss": 4.687279882406381, + "tokens_seen": 113967104 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048760280842527584, + "loss": 3.6841, + "theoretical_loss": 4.686913170427477, + "tokens_seen": 114032640 + }, + { + "epoch": 0.03, + "learning_rate": 0.000487592778335005, + "loss": 3.6383, + "theoretical_loss": 4.68654672811453, + "tokens_seen": 114098176 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875827482447342, + "loss": 3.5446, + "theoretical_loss": 4.68618055511452, + "tokens_seen": 114163712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875727181544634, + "loss": 3.4725, + "theoretical_loss": 4.685814651075088, + "tokens_seen": 114229248 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048756268806419257, + "loss": 3.4847, + "theoretical_loss": 4.685449015644537, + "tokens_seen": 114294784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875526579739218, + "loss": 3.6249, + "theoretical_loss": 4.685083648471835, + "tokens_seen": 114360320 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048754262788365093, + "loss": 3.6153, + "theoretical_loss": 4.684718549206607, + "tokens_seen": 114425856 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048753259779338017, + "loss": 3.6543, + "theoretical_loss": 4.6843537174991345, + "tokens_seen": 114491392 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048752256770310935, + "loss": 3.6148, + "theoretical_loss": 4.6839891530003595, + "tokens_seen": 114556928 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048751253761283853, + "loss": 3.4385, + "theoretical_loss": 4.683624855361876, + "tokens_seen": 114622464 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 212300, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6393067836761475, + "objective/train/theoretical_loss": 4.68326082423593, + "objective/train/tokens_used": 135148000, + "theoretical_loss": 4.68326082423593, + "tokens_seen": 114688000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875025075225677, + "loss": 3.6024, + "theoretical_loss": 4.68326082423593, + "tokens_seen": 114688000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874924774322969, + "loss": 3.4481, + "theoretical_loss": 4.682897059275422, + "tokens_seen": 114753536 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048748244734202607, + "loss": 3.5405, + "theoretical_loss": 4.682533560133901, + "tokens_seen": 114819072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874724172517553, + "loss": 3.5827, + "theoretical_loss": 4.682170326465565, + "tokens_seen": 114884608 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048746238716148443, + "loss": 3.5291, + "theoretical_loss": 4.681807357925257, + "tokens_seen": 114950144 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048745235707121367, + "loss": 3.6217, + "theoretical_loss": 4.681444654168468, + "tokens_seen": 115015680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874423269809428, + "loss": 3.4083, + "theoretical_loss": 4.68108221485133, + "tokens_seen": 115081216 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048743229689067203, + "loss": 3.5536, + "theoretical_loss": 4.680720039630617, + "tokens_seen": 115146752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874222668004012, + "loss": 3.6552, + "theoretical_loss": 4.680358128163747, + "tokens_seen": 115212288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874122367101304, + "loss": 3.7242, + "theoretical_loss": 4.679996480108773, + "tokens_seen": 115277824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004874022066198596, + "loss": 3.5851, + "theoretical_loss": 4.6796350951243895, + "tokens_seen": 115343360 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048739217652958876, + "loss": 3.5223, + "theoretical_loss": 4.679273972869922, + "tokens_seen": 115408896 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048738214643931794, + "loss": 3.5889, + "theoretical_loss": 4.678913113005333, + "tokens_seen": 115474432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048737211634904717, + "loss": 3.6441, + "theoretical_loss": 4.6785525151912175, + "tokens_seen": 115539968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873620862587763, + "loss": 3.7861, + "theoretical_loss": 4.678192179088802, + "tokens_seen": 115605504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048735205616850553, + "loss": 3.7474, + "theoretical_loss": 4.6778321043599425, + "tokens_seen": 115671040 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873420260782347, + "loss": 3.6169, + "theoretical_loss": 4.677472290667122, + "tokens_seen": 115736576 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873319959879639, + "loss": 3.6999, + "theoretical_loss": 4.677112737673453, + "tokens_seen": 115802112 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004873219658976931, + "loss": 3.5509, + "theoretical_loss": 4.676753445042669, + "tokens_seen": 115867648 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048731193580742226, + "loss": 3.7054, + "theoretical_loss": 4.676394412439132, + "tokens_seen": 115933184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048730190571715144, + "loss": 3.4979, + "theoretical_loss": 4.6760356395278215, + "tokens_seen": 115998720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872918756268807, + "loss": 3.4799, + "theoretical_loss": 4.675677125974339, + "tokens_seen": 116064256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872818455366098, + "loss": 3.5065, + "theoretical_loss": 4.675318871444908, + "tokens_seen": 116129792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048727181544633904, + "loss": 3.5598, + "theoretical_loss": 4.674960875606366, + "tokens_seen": 116195328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048726178535606816, + "loss": 3.64, + "theoretical_loss": 4.674603138126168, + "tokens_seen": 116260864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 215181, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.450789451599121, + "objective/train/theoretical_loss": 4.674245658672382, + "objective/train/tokens_used": 136786400, + "theoretical_loss": 4.674245658672382, + "tokens_seen": 116326400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872517552657974, + "loss": 3.5709, + "theoretical_loss": 4.674245658672382, + "tokens_seen": 116326400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872417251755266, + "loss": 3.7386, + "theoretical_loss": 4.673888436913694, + "tokens_seen": 116391936 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048723169508525576, + "loss": 3.686, + "theoretical_loss": 4.673531472519397, + "tokens_seen": 116457472 + }, + { + "epoch": 0.04, + "learning_rate": 0.000487221664994985, + "loss": 3.7699, + "theoretical_loss": 4.673174765159393, + "tokens_seen": 116523008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004872116349047141, + "loss": 3.6723, + "theoretical_loss": 4.672818314504198, + "tokens_seen": 116588544 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048720160481444336, + "loss": 3.6352, + "theoretical_loss": 4.6724621202249335, + "tokens_seen": 116654080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048719157472417254, + "loss": 3.6094, + "theoretical_loss": 4.672106181993324, + "tokens_seen": 116719616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871815446339017, + "loss": 3.7183, + "theoretical_loss": 4.6717504994817, + "tokens_seen": 116785152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871715145436309, + "loss": 3.6837, + "theoretical_loss": 4.671395072362996, + "tokens_seen": 116850688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048716148445336014, + "loss": 3.7233, + "theoretical_loss": 4.671039900310747, + "tokens_seen": 116916224 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048715145436308927, + "loss": 3.6096, + "theoretical_loss": 4.670684982999088, + "tokens_seen": 116981760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871414242728185, + "loss": 3.6379, + "theoretical_loss": 4.670330320102753, + "tokens_seen": 117047296 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048713139418254763, + "loss": 3.6043, + "theoretical_loss": 4.669975911297072, + "tokens_seen": 117112832 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048712136409227686, + "loss": 3.6982, + "theoretical_loss": 4.669621756257971, + "tokens_seen": 117178368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048711133400200604, + "loss": 3.7012, + "theoretical_loss": 4.669267854661973, + "tokens_seen": 117243904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871013039117352, + "loss": 3.6457, + "theoretical_loss": 4.668914206186189, + "tokens_seen": 117309440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870912738214644, + "loss": 3.5559, + "theoretical_loss": 4.6685608105083265, + "tokens_seen": 117374976 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870812437311936, + "loss": 3.5941, + "theoretical_loss": 4.66820766730668, + "tokens_seen": 117440512 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048707121364092277, + "loss": 3.4959, + "theoretical_loss": 4.667854776260132, + "tokens_seen": 117506048 + }, + { + "epoch": 0.04, + "learning_rate": 0.000487061183550652, + "loss": 3.5518, + "theoretical_loss": 4.667502137048155, + "tokens_seen": 117571584 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048705115346038113, + "loss": 3.8162, + "theoretical_loss": 4.667149749350805, + "tokens_seen": 117637120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048704112337011037, + "loss": 3.678, + "theoretical_loss": 4.666797612848723, + "tokens_seen": 117702656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048703109327983955, + "loss": 3.7429, + "theoretical_loss": 4.666445727223134, + "tokens_seen": 117768192 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048702106318956873, + "loss": 3.6926, + "theoretical_loss": 4.666094092155843, + "tokens_seen": 117833728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870110330992979, + "loss": 3.6113, + "theoretical_loss": 4.665742707329238, + "tokens_seen": 117899264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 218087, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4697864055633545, + "objective/train/theoretical_loss": 4.665391572426282, + "objective/train/tokens_used": 138424800, + "theoretical_loss": 4.665391572426282, + "tokens_seen": 117964800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870010030090271, + "loss": 3.6964, + "theoretical_loss": 4.665391572426282, + "tokens_seen": 117964800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048699097291875627, + "loss": 3.5604, + "theoretical_loss": 4.665040687130518, + "tokens_seen": 118030336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869809428284855, + "loss": 3.5842, + "theoretical_loss": 4.664690051126065, + "tokens_seen": 118095872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048697091273821463, + "loss": 3.783, + "theoretical_loss": 4.664339664097617, + "tokens_seen": 118161408 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048696088264794387, + "loss": 3.729, + "theoretical_loss": 4.66398952573044, + "tokens_seen": 118226944 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486950852557673, + "loss": 3.6628, + "theoretical_loss": 4.663639635710373, + "tokens_seen": 118292480 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048694082246740223, + "loss": 3.6035, + "theoretical_loss": 4.663289993723826, + "tokens_seen": 118358016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869307923771314, + "loss": 3.7186, + "theoretical_loss": 4.662940599457777, + "tokens_seen": 118423552 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869207622868606, + "loss": 3.6415, + "theoretical_loss": 4.662591452599774, + "tokens_seen": 118489088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869107321965898, + "loss": 3.6816, + "theoretical_loss": 4.662242552837929, + "tokens_seen": 118554624 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048690070210631896, + "loss": 3.6966, + "theoretical_loss": 4.661893899860923, + "tokens_seen": 118620160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048689067201604814, + "loss": 3.6318, + "theoretical_loss": 4.6615454933579965, + "tokens_seen": 118685696 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048688064192577737, + "loss": 3.6154, + "theoretical_loss": 4.661197333018957, + "tokens_seen": 118751232 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868706118355065, + "loss": 3.6098, + "theoretical_loss": 4.66084941853417, + "tokens_seen": 118816768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048686058174523573, + "loss": 3.663, + "theoretical_loss": 4.6605017495945615, + "tokens_seen": 118882304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868505516549649, + "loss": 3.7005, + "theoretical_loss": 4.660154325891618, + "tokens_seen": 118947840 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868405215646941, + "loss": 3.7505, + "theoretical_loss": 4.659807147117382, + "tokens_seen": 119013376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868304914744233, + "loss": 3.6065, + "theoretical_loss": 4.6594602129644525, + "tokens_seen": 119078912 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048682046138415246, + "loss": 3.6243, + "theoretical_loss": 4.659113523125981, + "tokens_seen": 119144448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048681043129388164, + "loss": 3.6437, + "theoretical_loss": 4.6587670772956775, + "tokens_seen": 119209984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004868004012036109, + "loss": 3.6719, + "theoretical_loss": 4.658420875167799, + "tokens_seen": 119275520 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048679037111334, + "loss": 3.5976, + "theoretical_loss": 4.658074916437155, + "tokens_seen": 119341056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048678034102306924, + "loss": 3.4214, + "theoretical_loss": 4.657729200799105, + "tokens_seen": 119406592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048677031093279836, + "loss": 3.6385, + "theoretical_loss": 4.657383727949558, + "tokens_seen": 119472128 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867602808425276, + "loss": 3.8363, + "theoretical_loss": 4.657038497584967, + "tokens_seen": 119537664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 220671, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.458449125289917, + "objective/train/theoretical_loss": 4.656693509402331, + "objective/train/tokens_used": 140063200, + "theoretical_loss": 4.656693509402331, + "tokens_seen": 119603200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867502507522568, + "loss": 3.6734, + "theoretical_loss": 4.656693509402331, + "tokens_seen": 119603200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048674022066198596, + "loss": 3.6732, + "theoretical_loss": 4.6563487630991975, + "tokens_seen": 119668736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048673019057171514, + "loss": 3.6713, + "theoretical_loss": 4.656004258373651, + "tokens_seen": 119734272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867201604814443, + "loss": 3.6181, + "theoretical_loss": 4.655659994924323, + "tokens_seen": 119799808 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867101303911735, + "loss": 3.6155, + "theoretical_loss": 4.655315972450383, + "tokens_seen": 119865344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048670010030090274, + "loss": 3.5514, + "theoretical_loss": 4.65497219065154, + "tokens_seen": 119930880 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048669007021063187, + "loss": 3.8192, + "theoretical_loss": 4.654628649228041, + "tokens_seen": 119996416 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004866800401203611, + "loss": 3.5989, + "theoretical_loss": 4.654285347880672, + "tokens_seen": 120061952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004866700100300903, + "loss": 3.498, + "theoretical_loss": 4.653942286310749, + "tokens_seen": 120127488 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048665997993981947, + "loss": 3.5397, + "theoretical_loss": 4.653599464220129, + "tokens_seen": 120193024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048664994984954865, + "loss": 3.6488, + "theoretical_loss": 4.653256881311198, + "tokens_seen": 120258560 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048663991975927783, + "loss": 3.5823, + "theoretical_loss": 4.6529145372868745, + "tokens_seen": 120324096 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486629889669007, + "loss": 3.4156, + "theoretical_loss": 4.652572431850608, + "tokens_seen": 120389632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048661985957873624, + "loss": 3.7352, + "theoretical_loss": 4.652230564706377, + "tokens_seen": 120455168 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048660982948846537, + "loss": 3.5848, + "theoretical_loss": 4.651888935558688, + "tokens_seen": 120520704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865997993981946, + "loss": 3.6173, + "theoretical_loss": 4.651547544112575, + "tokens_seen": 120586240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048658976930792373, + "loss": 3.7562, + "theoretical_loss": 4.651206390073597, + "tokens_seen": 120651776 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048657973921765297, + "loss": 3.7376, + "theoretical_loss": 4.650865473147837, + "tokens_seen": 120717312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048656970912738215, + "loss": 3.4611, + "theoretical_loss": 4.650524793041903, + "tokens_seen": 120782848 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048655967903711133, + "loss": 3.6694, + "theoretical_loss": 4.650184349462922, + "tokens_seen": 120848384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865496489468405, + "loss": 3.8027, + "theoretical_loss": 4.649844142118544, + "tokens_seen": 120913920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048653961885656975, + "loss": 3.731, + "theoretical_loss": 4.6495041707169396, + "tokens_seen": 120979456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865295887662989, + "loss": 3.7667, + "theoretical_loss": 4.649164434966794, + "tokens_seen": 121044992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004865195586760281, + "loss": 3.505, + "theoretical_loss": 4.648824934577313, + "tokens_seen": 121110528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048650952858575724, + "loss": 3.6041, + "theoretical_loss": 4.648485669258216, + "tokens_seen": 121176064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 223562, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6140758991241455, + "objective/train/theoretical_loss": 4.648146638719739, + "objective/train/tokens_used": 141701600, + "theoretical_loss": 4.648146638719739, + "tokens_seen": 121241600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048649949849548647, + "loss": 3.4713, + "theoretical_loss": 4.648146638719739, + "tokens_seen": 121241600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048648946840521565, + "loss": 3.5895, + "theoretical_loss": 4.647807842672631, + "tokens_seen": 121307136 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048647943831494483, + "loss": 3.4733, + "theoretical_loss": 4.647469280828153, + "tokens_seen": 121372672 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048646940822467407, + "loss": 3.6673, + "theoretical_loss": 4.647130952898077, + "tokens_seen": 121438208 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864593781344032, + "loss": 3.6227, + "theoretical_loss": 4.646792858594686, + "tokens_seen": 121503744 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048644934804413243, + "loss": 3.673, + "theoretical_loss": 4.64645499763077, + "tokens_seen": 121569280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864393179538616, + "loss": 3.6762, + "theoretical_loss": 4.646117369719629, + "tokens_seen": 121634816 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004864292878635908, + "loss": 3.551, + "theoretical_loss": 4.645779974575069, + "tokens_seen": 121700352 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048641925777332, + "loss": 3.4529, + "theoretical_loss": 4.6454428119113995, + "tokens_seen": 121765888 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048640922768304916, + "loss": 3.5687, + "theoretical_loss": 4.6451058814434365, + "tokens_seen": 121831424 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048639919759277834, + "loss": 3.5279, + "theoretical_loss": 4.644769182886495, + "tokens_seen": 121896960 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048638916750250757, + "loss": 3.7372, + "theoretical_loss": 4.644432715956399, + "tokens_seen": 121962496 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863791374122367, + "loss": 3.5191, + "theoretical_loss": 4.644096480369466, + "tokens_seen": 122028032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048636910732196593, + "loss": 3.7421, + "theoretical_loss": 4.643760475842518, + "tokens_seen": 122093568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863590772316951, + "loss": 3.7829, + "theoretical_loss": 4.6434247020928705, + "tokens_seen": 122159104 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863490471414243, + "loss": 3.6374, + "theoretical_loss": 4.643089158838341, + "tokens_seen": 122224640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863390170511535, + "loss": 3.5717, + "theoretical_loss": 4.642753845797243, + "tokens_seen": 122290176 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048632898696088266, + "loss": 3.473, + "theoretical_loss": 4.642418762688379, + "tokens_seen": 122355712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048631895687061184, + "loss": 3.5659, + "theoretical_loss": 4.642083909231053, + "tokens_seen": 122421248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004863089267803411, + "loss": 3.5956, + "theoretical_loss": 4.641749285145057, + "tokens_seen": 122486784 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862988966900702, + "loss": 3.5018, + "theoretical_loss": 4.641414890150675, + "tokens_seen": 122552320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048628886659979944, + "loss": 3.7073, + "theoretical_loss": 4.641080723968684, + "tokens_seen": 122617856 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048627883650952857, + "loss": 3.4656, + "theoretical_loss": 4.6407467863203475, + "tokens_seen": 122683392 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862688064192578, + "loss": 3.7052, + "theoretical_loss": 4.640413076927418, + "tokens_seen": 122748928 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486258776328987, + "loss": 3.6263, + "theoretical_loss": 4.6400795955121374, + "tokens_seen": 122814464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 226297, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4445888996124268, + "objective/train/theoretical_loss": 4.639746341797229, + "objective/train/tokens_used": 143340000, + "theoretical_loss": 4.639746341797229, + "tokens_seen": 122880000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048624874623871616, + "loss": 3.4454, + "theoretical_loss": 4.639746341797229, + "tokens_seen": 122880000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048623871614844534, + "loss": 3.5496, + "theoretical_loss": 4.639413315505905, + "tokens_seen": 122945536 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862286860581745, + "loss": 3.6716, + "theoretical_loss": 4.639080516361861, + "tokens_seen": 123011072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862186559679037, + "loss": 3.5383, + "theoretical_loss": 4.638747944089273, + "tokens_seen": 123076608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048620862587763294, + "loss": 3.6888, + "theoretical_loss": 4.638415598412799, + "tokens_seen": 123142144 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048619859578736207, + "loss": 3.5903, + "theoretical_loss": 4.638083479057579, + "tokens_seen": 123207680 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861885656970913, + "loss": 3.5566, + "theoretical_loss": 4.637751585749234, + "tokens_seen": 123273216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861785356068205, + "loss": 3.5825, + "theoretical_loss": 4.6374199182138565, + "tokens_seen": 123338752 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048616850551654967, + "loss": 3.5623, + "theoretical_loss": 4.637088476178025, + "tokens_seen": 123404288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048615847542627885, + "loss": 3.6986, + "theoretical_loss": 4.636757259368787, + "tokens_seen": 123469824 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048614844533600803, + "loss": 3.6858, + "theoretical_loss": 4.636426267513668, + "tokens_seen": 123535360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861384152457372, + "loss": 3.5632, + "theoretical_loss": 4.636095500340669, + "tokens_seen": 123600896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048612838515546644, + "loss": 3.6303, + "theoretical_loss": 4.635764957578261, + "tokens_seen": 123666432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048611835506519557, + "loss": 3.6519, + "theoretical_loss": 4.635434638955388, + "tokens_seen": 123731968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004861083249749248, + "loss": 3.6202, + "theoretical_loss": 4.635104544201465, + "tokens_seen": 123797504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048609829488465393, + "loss": 3.4646, + "theoretical_loss": 4.634774673046376, + "tokens_seen": 123863040 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048608826479438317, + "loss": 3.5565, + "theoretical_loss": 4.634445025220475, + "tokens_seen": 123928576 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048607823470411235, + "loss": 3.5219, + "theoretical_loss": 4.634115600454582, + "tokens_seen": 123994112 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048606820461384153, + "loss": 3.5513, + "theoretical_loss": 4.633786398479983, + "tokens_seen": 124059648 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860581745235707, + "loss": 3.6201, + "theoretical_loss": 4.6334574190284314, + "tokens_seen": 124125184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048604814443329995, + "loss": 3.8525, + "theoretical_loss": 4.633128661832145, + "tokens_seen": 124190720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860381143430291, + "loss": 3.5434, + "theoretical_loss": 4.632800126623803, + "tokens_seen": 124256256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004860280842527583, + "loss": 3.6372, + "theoretical_loss": 4.632471813136547, + "tokens_seen": 124321792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048601805416248744, + "loss": 3.6406, + "theoretical_loss": 4.632143721103983, + "tokens_seen": 124387328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048600802407221667, + "loss": 3.6116, + "theoretical_loss": 4.631815850260173, + "tokens_seen": 124452864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 229052, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.710355043411255, + "objective/train/theoretical_loss": 4.631488200339643, + "objective/train/tokens_used": 144978400, + "theoretical_loss": 4.631488200339643, + "tokens_seen": 124518400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048599799398194585, + "loss": 3.6702, + "theoretical_loss": 4.631488200339643, + "tokens_seen": 124518400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048598796389167503, + "loss": 3.6719, + "theoretical_loss": 4.63116077107737, + "tokens_seen": 124583936 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859779338014042, + "loss": 3.6649, + "theoretical_loss": 4.630833562208797, + "tokens_seen": 124649472 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859679037111334, + "loss": 3.5806, + "theoretical_loss": 4.630506573469815, + "tokens_seen": 124715008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859578736208626, + "loss": 3.5875, + "theoretical_loss": 4.630179804596775, + "tokens_seen": 124780544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859478435305918, + "loss": 3.5722, + "theoretical_loss": 4.629853255326481, + "tokens_seen": 124846080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048593781344032094, + "loss": 3.6885, + "theoretical_loss": 4.629526925396189, + "tokens_seen": 124911616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859277833500502, + "loss": 3.5791, + "theoretical_loss": 4.6292008145436085, + "tokens_seen": 124977152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859177532597793, + "loss": 3.6502, + "theoretical_loss": 4.628874922506897, + "tokens_seen": 125042688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048590772316950854, + "loss": 3.5884, + "theoretical_loss": 4.628549249024666, + "tokens_seen": 125108224 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858976930792377, + "loss": 3.4793, + "theoretical_loss": 4.628223793835975, + "tokens_seen": 125173760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858876629889669, + "loss": 3.7404, + "theoretical_loss": 4.627898556680327, + "tokens_seen": 125239296 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858776328986961, + "loss": 3.4483, + "theoretical_loss": 4.627573537297678, + "tokens_seen": 125304832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858676028084253, + "loss": 3.6227, + "theoretical_loss": 4.627248735428427, + "tokens_seen": 125370368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048585757271815444, + "loss": 3.5744, + "theoretical_loss": 4.6269241508134185, + "tokens_seen": 125435904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858475426278837, + "loss": 3.4956, + "theoretical_loss": 4.6265997831939405, + "tokens_seen": 125501440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858375125376128, + "loss": 3.6002, + "theoretical_loss": 4.6262756323117245, + "tokens_seen": 125566976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048582748244734204, + "loss": 3.5851, + "theoretical_loss": 4.625951697908944, + "tokens_seen": 125632512 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858174523570712, + "loss": 3.743, + "theoretical_loss": 4.625627979728212, + "tokens_seen": 125698048 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004858074222668004, + "loss": 3.5072, + "theoretical_loss": 4.625304477512584, + "tokens_seen": 125763584 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857973921765296, + "loss": 3.5007, + "theoretical_loss": 4.624981191005554, + "tokens_seen": 125829120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048578736208625877, + "loss": 3.7045, + "theoretical_loss": 4.624658119951052, + "tokens_seen": 125894656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048577733199598795, + "loss": 3.5168, + "theoretical_loss": 4.624335264093447, + "tokens_seen": 125960192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857673019057172, + "loss": 3.7836, + "theoretical_loss": 4.624012623177544, + "tokens_seen": 126025728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857572718154463, + "loss": 3.6721, + "theoretical_loss": 4.623690196948582, + "tokens_seen": 126091264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 230495, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.500173568725586, + "objective/train/theoretical_loss": 4.623367985152234, + "objective/train/tokens_used": 146616800, + "theoretical_loss": 4.623367985152234, + "tokens_seen": 126156800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048574724172517554, + "loss": 3.6381, + "theoretical_loss": 4.623367985152234, + "tokens_seen": 126156800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048573721163490467, + "loss": 3.6023, + "theoretical_loss": 4.623045987534609, + "tokens_seen": 126222336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857271815446339, + "loss": 3.473, + "theoretical_loss": 4.622724203842246, + "tokens_seen": 126287872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048571715145436314, + "loss": 3.6153, + "theoretical_loss": 4.622402633822114, + "tokens_seen": 126353408 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048570712136409227, + "loss": 3.6012, + "theoretical_loss": 4.622081277221616, + "tokens_seen": 126418944 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856970912738215, + "loss": 3.5613, + "theoretical_loss": 4.62176013378858, + "tokens_seen": 126484480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856870611835507, + "loss": 3.614, + "theoretical_loss": 4.621439203271267, + "tokens_seen": 126550016 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048567703109327987, + "loss": 3.6849, + "theoretical_loss": 4.621118485418362, + "tokens_seen": 126615552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048566700100300905, + "loss": 3.5198, + "theoretical_loss": 4.620797979978978, + "tokens_seen": 126681088 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048565697091273823, + "loss": 3.6159, + "theoretical_loss": 4.620477686702651, + "tokens_seen": 126746624 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856469408224674, + "loss": 3.8222, + "theoretical_loss": 4.620157605339347, + "tokens_seen": 126812160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048563691073219664, + "loss": 3.6207, + "theoretical_loss": 4.619837735639452, + "tokens_seen": 126877696 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048562688064192577, + "loss": 3.5069, + "theoretical_loss": 4.619518077353776, + "tokens_seen": 126943232 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485616850551655, + "loss": 3.7153, + "theoretical_loss": 4.619198630233547, + "tokens_seen": 127008768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048560682046138413, + "loss": 3.3893, + "theoretical_loss": 4.6188793940304205, + "tokens_seen": 127074304 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048559679037111337, + "loss": 3.5857, + "theoretical_loss": 4.618560368496466, + "tokens_seen": 127139840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048558676028084255, + "loss": 3.5823, + "theoretical_loss": 4.618241553384175, + "tokens_seen": 127205376 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048557673019057173, + "loss": 3.5309, + "theoretical_loss": 4.617922948446459, + "tokens_seen": 127270912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855667001003009, + "loss": 3.6006, + "theoretical_loss": 4.617604553436642, + "tokens_seen": 127336448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048555667001003015, + "loss": 3.605, + "theoretical_loss": 4.617286368108466, + "tokens_seen": 127401984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855466399197593, + "loss": 3.6009, + "theoretical_loss": 4.6169683922160925, + "tokens_seen": 127467520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004855366098294885, + "loss": 3.4727, + "theoretical_loss": 4.616650625514091, + "tokens_seen": 127533056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048552657973921764, + "loss": 3.7519, + "theoretical_loss": 4.616333067757449, + "tokens_seen": 127598592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048551654964894687, + "loss": 3.4998, + "theoretical_loss": 4.616015718701563, + "tokens_seen": 127664128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048550651955867605, + "loss": 3.7471, + "theoretical_loss": 4.615698578102245, + "tokens_seen": 127729664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 233438, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.866485118865967, + "objective/train/theoretical_loss": 4.615381645715717, + "objective/train/tokens_used": 148255200, + "theoretical_loss": 4.615381645715717, + "tokens_seen": 127795200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048549648946840523, + "loss": 3.5829, + "theoretical_loss": 4.615381645715717, + "tokens_seen": 127795200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854864593781344, + "loss": 3.4515, + "theoretical_loss": 4.615064921298608, + "tokens_seen": 127860736 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854764292878636, + "loss": 3.612, + "theoretical_loss": 4.61474840460796, + "tokens_seen": 127926272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854663991975928, + "loss": 3.7819, + "theoretical_loss": 4.614432095401219, + "tokens_seen": 127991808 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485456369107322, + "loss": 3.6219, + "theoretical_loss": 4.614115993436242, + "tokens_seen": 128057344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048544633901705114, + "loss": 3.7261, + "theoretical_loss": 4.613800098471291, + "tokens_seen": 128122880 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854363089267804, + "loss": 3.6489, + "theoretical_loss": 4.613484410265032, + "tokens_seen": 128188416 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854262788365095, + "loss": 3.5794, + "theoretical_loss": 4.613168928576538, + "tokens_seen": 128253952 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048541624874623874, + "loss": 3.6887, + "theoretical_loss": 4.612853653165283, + "tokens_seen": 128319488 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004854062186559679, + "loss": 3.611, + "theoretical_loss": 4.612538583791146, + "tokens_seen": 128385024 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853961885656971, + "loss": 3.4738, + "theoretical_loss": 4.612223720214407, + "tokens_seen": 128450560 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853861584754263, + "loss": 3.3578, + "theoretical_loss": 4.611909062195749, + "tokens_seen": 128516096 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853761283851555, + "loss": 3.4128, + "theoretical_loss": 4.61159460949625, + "tokens_seen": 128581632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048536609829488464, + "loss": 3.6116, + "theoretical_loss": 4.611280361877393, + "tokens_seen": 128647168 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853560682046139, + "loss": 3.4273, + "theoretical_loss": 4.610966319101056, + "tokens_seen": 128712704 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485346038114343, + "loss": 3.7758, + "theoretical_loss": 4.610652480929515, + "tokens_seen": 128778240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048533600802407224, + "loss": 3.6812, + "theoretical_loss": 4.610338847125445, + "tokens_seen": 128843776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853259779338014, + "loss": 3.6136, + "theoretical_loss": 4.610025417451913, + "tokens_seen": 128909312 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853159478435306, + "loss": 3.746, + "theoretical_loss": 4.6097121916723856, + "tokens_seen": 128974848 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853059177532598, + "loss": 3.5145, + "theoretical_loss": 4.609399169550718, + "tokens_seen": 129040384 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048529588766298897, + "loss": 3.5438, + "theoretical_loss": 4.609086350851165, + "tokens_seen": 129105920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048528585757271815, + "loss": 3.5249, + "theoretical_loss": 4.6087737353383655, + "tokens_seen": 129171456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852758274824474, + "loss": 3.5502, + "theoretical_loss": 4.6084613227773605, + "tokens_seen": 129236992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852657973921765, + "loss": 3.76, + "theoretical_loss": 4.608149112933571, + "tokens_seen": 129302528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048525576730190574, + "loss": 3.6529, + "theoretical_loss": 4.607837105572816, + "tokens_seen": 129368064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 235763, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.746123790740967, + "objective/train/theoretical_loss": 4.607525300461299, + "objective/train/tokens_used": 149893600, + "theoretical_loss": 4.607525300461299, + "tokens_seen": 129433600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048524573721163487, + "loss": 3.519, + "theoretical_loss": 4.607525300461299, + "tokens_seen": 129433600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852357071213641, + "loss": 3.4473, + "theoretical_loss": 4.607213697365613, + "tokens_seen": 129499136 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852256770310933, + "loss": 3.6802, + "theoretical_loss": 4.606902296052739, + "tokens_seen": 129564672 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048521564694082247, + "loss": 3.4782, + "theoretical_loss": 4.6065910962900425, + "tokens_seen": 129630208 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048520561685055165, + "loss": 3.5924, + "theoretical_loss": 4.606280097845277, + "tokens_seen": 129695744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851955867602809, + "loss": 3.6023, + "theoretical_loss": 4.60596930048658, + "tokens_seen": 129761280 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048518555667001, + "loss": 3.7175, + "theoretical_loss": 4.605658703982471, + "tokens_seen": 129826816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048517552657973925, + "loss": 3.5995, + "theoretical_loss": 4.6053483081018545, + "tokens_seen": 129892352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851654964894684, + "loss": 3.3884, + "theoretical_loss": 4.605038112614018, + "tokens_seen": 129957888 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851554663991976, + "loss": 3.588, + "theoretical_loss": 4.604728117288631, + "tokens_seen": 130023424 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851454363089268, + "loss": 3.5162, + "theoretical_loss": 4.604418321895739, + "tokens_seen": 130088960 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048513540621865597, + "loss": 3.5941, + "theoretical_loss": 4.604108726205774, + "tokens_seen": 130154496 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048512537612838515, + "loss": 3.6495, + "theoretical_loss": 4.60379932998954, + "tokens_seen": 130220032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048511534603811433, + "loss": 3.5774, + "theoretical_loss": 4.6034901330182265, + "tokens_seen": 130285568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004851053159478435, + "loss": 3.5778, + "theoretical_loss": 4.603181135063394, + "tokens_seen": 130351104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048509528585757275, + "loss": 3.5937, + "theoretical_loss": 4.6028723358969845, + "tokens_seen": 130416640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850852557673019, + "loss": 3.5829, + "theoretical_loss": 4.602563735291312, + "tokens_seen": 130482176 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850752256770311, + "loss": 3.5905, + "theoretical_loss": 4.602255333019068, + "tokens_seen": 130547712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048506519558676024, + "loss": 3.6318, + "theoretical_loss": 4.6019471288533165, + "tokens_seen": 130613248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850551654964895, + "loss": 3.4687, + "theoretical_loss": 4.601639122567497, + "tokens_seen": 130678784 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048504513540621866, + "loss": 3.5505, + "theoretical_loss": 4.601331313935418, + "tokens_seen": 130744320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048503510531594784, + "loss": 3.6667, + "theoretical_loss": 4.601023702731264, + "tokens_seen": 130809856 + }, + { + "epoch": 0.04, + "learning_rate": 0.000485025075225677, + "loss": 3.4267, + "theoretical_loss": 4.600716288729587, + "tokens_seen": 130875392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048501504513540625, + "loss": 3.5855, + "theoretical_loss": 4.600409071705312, + "tokens_seen": 130940928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004850050150451354, + "loss": 3.6781, + "theoretical_loss": 4.60010205143373, + "tokens_seen": 131006464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 238624, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6152400970458984, + "objective/train/theoretical_loss": 4.599795227690505, + "objective/train/tokens_used": 151532000, + "theoretical_loss": 4.599795227690505, + "tokens_seen": 131072000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849949849548646, + "loss": 3.72, + "theoretical_loss": 4.599795227690505, + "tokens_seen": 131072000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849849548645938, + "loss": 3.6126, + "theoretical_loss": 4.5994886002516635, + "tokens_seen": 131137536 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484974924774323, + "loss": 3.6646, + "theoretical_loss": 4.599182168893604, + "tokens_seen": 131203072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849648946840522, + "loss": 3.2893, + "theoretical_loss": 4.598875933393089, + "tokens_seen": 131268608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048495486459378134, + "loss": 3.6764, + "theoretical_loss": 4.5985698935272445, + "tokens_seen": 131334144 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849448345035106, + "loss": 3.5485, + "theoretical_loss": 4.598264049073565, + "tokens_seen": 131399680 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849348044132397, + "loss": 3.7149, + "theoretical_loss": 4.597958399809908, + "tokens_seen": 131465216 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048492477432296894, + "loss": 3.619, + "theoretical_loss": 4.59765294551449, + "tokens_seen": 131530752 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849147442326981, + "loss": 3.4863, + "theoretical_loss": 4.597347685965897, + "tokens_seen": 131596288 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004849047141424273, + "loss": 3.3637, + "theoretical_loss": 4.597042620943069, + "tokens_seen": 131661824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848946840521565, + "loss": 3.6284, + "theoretical_loss": 4.596737750225311, + "tokens_seen": 131727360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848846539618857, + "loss": 3.5994, + "theoretical_loss": 4.596433073592289, + "tokens_seen": 131792896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048487462387161484, + "loss": 3.663, + "theoretical_loss": 4.596128590824026, + "tokens_seen": 131858432 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848645937813441, + "loss": 3.4375, + "theoretical_loss": 4.595824301700904, + "tokens_seen": 131923968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848545636910732, + "loss": 3.6939, + "theoretical_loss": 4.595520206003663, + "tokens_seen": 131989504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048484453360080244, + "loss": 3.6163, + "theoretical_loss": 4.595216303513399, + "tokens_seen": 132055040 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848345035105316, + "loss": 3.656, + "theoretical_loss": 4.594912594011566, + "tokens_seen": 132120576 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004848244734202608, + "loss": 3.6303, + "theoretical_loss": 4.594609077279973, + "tokens_seen": 132186112 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048481444332999, + "loss": 3.5099, + "theoretical_loss": 4.594305753100782, + "tokens_seen": 132251648 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048480441323971917, + "loss": 3.682, + "theoretical_loss": 4.594002621256511, + "tokens_seen": 132317184 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048479438314944835, + "loss": 3.4167, + "theoretical_loss": 4.59369968153003, + "tokens_seen": 132382720 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847843530591776, + "loss": 3.5741, + "theoretical_loss": 4.593396933704562, + "tokens_seen": 132448256 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847743229689067, + "loss": 3.5787, + "theoretical_loss": 4.593094377563681, + "tokens_seen": 132513792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048476429287863594, + "loss": 3.6099, + "theoretical_loss": 4.592792012891314, + "tokens_seen": 132579328 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048475426278836507, + "loss": 3.5282, + "theoretical_loss": 4.592489839471735, + "tokens_seen": 132644864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 241377, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6454577445983887, + "objective/train/theoretical_loss": 4.592187857089571, + "objective/train/tokens_used": 153170400, + "theoretical_loss": 4.592187857089571, + "tokens_seen": 132710400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847442326980943, + "loss": 3.6721, + "theoretical_loss": 4.592187857089571, + "tokens_seen": 132710400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847342026078235, + "loss": 3.5045, + "theoretical_loss": 4.591886065529795, + "tokens_seen": 132775936 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048472417251755267, + "loss": 3.5619, + "theoretical_loss": 4.591584464577728, + "tokens_seen": 132841472 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048471414242728185, + "loss": 3.3762, + "theoretical_loss": 4.591283054019041, + "tokens_seen": 132907008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004847041123370111, + "loss": 3.622, + "theoretical_loss": 4.5909818336397485, + "tokens_seen": 132972544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846940822467402, + "loss": 3.5857, + "theoretical_loss": 4.590680803226213, + "tokens_seen": 133038080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048468405215646945, + "loss": 3.6409, + "theoretical_loss": 4.590379962565141, + "tokens_seen": 133103616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846740220661986, + "loss": 3.7579, + "theoretical_loss": 4.590079311443583, + "tokens_seen": 133169152 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846639919759278, + "loss": 3.4171, + "theoretical_loss": 4.589778849648934, + "tokens_seen": 133234688 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484653961885657, + "loss": 3.5035, + "theoretical_loss": 4.589478576968932, + "tokens_seen": 133300224 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048464393179538617, + "loss": 3.5472, + "theoretical_loss": 4.589178493191655, + "tokens_seen": 133365760 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048463390170511535, + "loss": 3.6052, + "theoretical_loss": 4.588878598105527, + "tokens_seen": 133431296 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048462387161484453, + "loss": 3.5834, + "theoretical_loss": 4.588578891499308, + "tokens_seen": 133496832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004846138415245737, + "loss": 3.5189, + "theoretical_loss": 4.588279373162101, + "tokens_seen": 133562368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048460381143430295, + "loss": 3.5182, + "theoretical_loss": 4.587980042883347, + "tokens_seen": 133627904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845937813440321, + "loss": 3.665, + "theoretical_loss": 4.587680900452824, + "tokens_seen": 133693440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845837512537613, + "loss": 3.4734, + "theoretical_loss": 4.587381945660653, + "tokens_seen": 133758976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048457372116349044, + "loss": 3.4591, + "theoretical_loss": 4.587083178297288, + "tokens_seen": 133824512 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845636910732197, + "loss": 3.6534, + "theoretical_loss": 4.5867845981535185, + "tokens_seen": 133890048 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048455366098294886, + "loss": 3.7087, + "theoretical_loss": 4.586486205020474, + "tokens_seen": 133955584 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048454363089267804, + "loss": 3.6505, + "theoretical_loss": 4.586187998689616, + "tokens_seen": 134021120 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845336008024072, + "loss": 3.501, + "theoretical_loss": 4.585889978952741, + "tokens_seen": 134086656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048452357071213645, + "loss": 3.5385, + "theoretical_loss": 4.58559214560198, + "tokens_seen": 134152192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845135406218656, + "loss": 3.3732, + "theoretical_loss": 4.585294498429796, + "tokens_seen": 134217728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004845035105315948, + "loss": 3.4149, + "theoretical_loss": 4.584997037228986, + "tokens_seen": 134283264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 244333, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6224210262298584, + "objective/train/theoretical_loss": 4.584699761792674, + "objective/train/tokens_used": 154808800, + "theoretical_loss": 4.584699761792674, + "tokens_seen": 134348800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048449348044132394, + "loss": 3.5586, + "theoretical_loss": 4.584699761792674, + "tokens_seen": 134348800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844834503510532, + "loss": 3.5878, + "theoretical_loss": 4.5844026719143205, + "tokens_seen": 134414336 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048447342026078236, + "loss": 3.5422, + "theoretical_loss": 4.5841057673877135, + "tokens_seen": 134479872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048446339017051154, + "loss": 3.4814, + "theoretical_loss": 4.5838090480069695, + "tokens_seen": 134545408 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844533600802407, + "loss": 3.5463, + "theoretical_loss": 4.5835125135665375, + "tokens_seen": 134610944 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844433299899699, + "loss": 3.4516, + "theoretical_loss": 4.583216163861191, + "tokens_seen": 134676480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844332998996991, + "loss": 3.4074, + "theoretical_loss": 4.58291999868603, + "tokens_seen": 134742016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844232698094283, + "loss": 3.4035, + "theoretical_loss": 4.582624017836489, + "tokens_seen": 134807552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048441323971915745, + "loss": 3.5022, + "theoretical_loss": 4.582328221108318, + "tokens_seen": 134873088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004844032096288867, + "loss": 3.525, + "theoretical_loss": 4.5820326082976, + "tokens_seen": 134938624 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048439317953861586, + "loss": 3.5199, + "theoretical_loss": 4.581737179200739, + "tokens_seen": 135004160 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048438314944834504, + "loss": 3.5933, + "theoretical_loss": 4.581441933614466, + "tokens_seen": 135069696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843731193580742, + "loss": 3.6723, + "theoretical_loss": 4.581146871335832, + "tokens_seen": 135135232 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843630892678034, + "loss": 3.6993, + "theoretical_loss": 4.580851992162214, + "tokens_seen": 135200768 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843530591775326, + "loss": 3.4415, + "theoretical_loss": 4.5805572958913086, + "tokens_seen": 135266304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843430290872618, + "loss": 3.4204, + "theoretical_loss": 4.580262782321135, + "tokens_seen": 135331840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048433299899699095, + "loss": 3.4669, + "theoretical_loss": 4.579968451250032, + "tokens_seen": 135397376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843229689067202, + "loss": 3.6211, + "theoretical_loss": 4.579674302476661, + "tokens_seen": 135462912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843129388164493, + "loss": 3.6026, + "theoretical_loss": 4.579380335800001, + "tokens_seen": 135528448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048430290872617855, + "loss": 3.6153, + "theoretical_loss": 4.579086551019348, + "tokens_seen": 135593984 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048429287863590773, + "loss": 3.5763, + "theoretical_loss": 4.5787929479343195, + "tokens_seen": 135659520 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842828485456369, + "loss": 3.578, + "theoretical_loss": 4.578499526344848, + "tokens_seen": 135725056 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842728184553661, + "loss": 3.423, + "theoretical_loss": 4.578206286051184, + "tokens_seen": 135790592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048426278836509527, + "loss": 3.6017, + "theoretical_loss": 4.5779132268538945, + "tokens_seen": 135856128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048425275827482445, + "loss": 3.4435, + "theoretical_loss": 4.577620348553859, + "tokens_seen": 135921664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 247335, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.707146167755127, + "objective/train/theoretical_loss": 4.577327650952276, + "objective/train/tokens_used": 156447200, + "theoretical_loss": 4.577327650952276, + "tokens_seen": 135987200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842427281845537, + "loss": 3.6185, + "theoretical_loss": 4.577327650952276, + "tokens_seen": 135987200 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048423269809428287, + "loss": 3.7638, + "theoretical_loss": 4.5770351338506545, + "tokens_seen": 136052736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048422266800401205, + "loss": 3.4542, + "theoretical_loss": 4.57674279705082, + "tokens_seen": 136118272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842126379137413, + "loss": 3.5167, + "theoretical_loss": 4.57645064035491, + "tokens_seen": 136183808 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842026078234704, + "loss": 3.5904, + "theoretical_loss": 4.576158663565371, + "tokens_seen": 136249344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048419257773319965, + "loss": 3.7094, + "theoretical_loss": 4.575866866484967, + "tokens_seen": 136314880 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841825476429288, + "loss": 3.6926, + "theoretical_loss": 4.575575248916767, + "tokens_seen": 136380416 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484172517552658, + "loss": 3.492, + "theoretical_loss": 4.575283810664155, + "tokens_seen": 136445952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841624874623872, + "loss": 3.4407, + "theoretical_loss": 4.574992551530822, + "tokens_seen": 136511488 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048415245737211637, + "loss": 3.5272, + "theoretical_loss": 4.574701471320768, + "tokens_seen": 136577024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048414242728184555, + "loss": 3.5863, + "theoretical_loss": 4.574410569838304, + "tokens_seen": 136642560 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048413239719157473, + "loss": 3.6504, + "theoretical_loss": 4.574119846888045, + "tokens_seen": 136708096 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841223671013039, + "loss": 3.4721, + "theoretical_loss": 4.573829302274915, + "tokens_seen": 136773632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048411233701103315, + "loss": 3.2975, + "theoretical_loss": 4.573538935804146, + "tokens_seen": 136839168 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004841023069207623, + "loss": 3.5323, + "theoretical_loss": 4.573248747281273, + "tokens_seen": 136904704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840922768304915, + "loss": 3.6547, + "theoretical_loss": 4.5729587365121365, + "tokens_seen": 136970240 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048408224674022064, + "loss": 3.7289, + "theoretical_loss": 4.572668903302886, + "tokens_seen": 137035776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840722166499499, + "loss": 3.6803, + "theoretical_loss": 4.572379247459969, + "tokens_seen": 137101312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048406218655967906, + "loss": 3.6031, + "theoretical_loss": 4.57208976879014, + "tokens_seen": 137166848 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048405215646940824, + "loss": 3.5582, + "theoretical_loss": 4.571800467100456, + "tokens_seen": 137232384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840421263791374, + "loss": 3.5632, + "theoretical_loss": 4.5715113421982725, + "tokens_seen": 137297920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048403209628886665, + "loss": 3.5551, + "theoretical_loss": 4.571222393891253, + "tokens_seen": 137363456 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004840220661985958, + "loss": 3.5667, + "theoretical_loss": 4.570933621987356, + "tokens_seen": 137428992 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484012036108325, + "loss": 3.4934, + "theoretical_loss": 4.570645026294844, + "tokens_seen": 137494528 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048400200601805414, + "loss": 3.6178, + "theoretical_loss": 4.570356606622278, + "tokens_seen": 137560064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 249317, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7875092029571533, + "objective/train/theoretical_loss": 4.570068362778516, + "objective/train/tokens_used": 158085600, + "theoretical_loss": 4.570068362778516, + "tokens_seen": 137625600 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839919759277834, + "loss": 3.6262, + "theoretical_loss": 4.570068362778516, + "tokens_seen": 137625600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048398194583751256, + "loss": 3.5846, + "theoretical_loss": 4.569780294572718, + "tokens_seen": 137691136 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048397191574724174, + "loss": 3.688, + "theoretical_loss": 4.569492401814339, + "tokens_seen": 137756672 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839618856569709, + "loss": 3.4855, + "theoretical_loss": 4.569204684313133, + "tokens_seen": 137822208 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839518555667001, + "loss": 3.5591, + "theoretical_loss": 4.568917141879149, + "tokens_seen": 137887744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839418254764293, + "loss": 3.6552, + "theoretical_loss": 4.568629774322736, + "tokens_seen": 137953280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839317953861585, + "loss": 3.5707, + "theoretical_loss": 4.568342581454532, + "tokens_seen": 138018816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048392176529588765, + "loss": 3.6346, + "theoretical_loss": 4.568055563085476, + "tokens_seen": 138084352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839117352056169, + "loss": 3.6634, + "theoretical_loss": 4.567768719026797, + "tokens_seen": 138149888 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048390170511534606, + "loss": 3.4294, + "theoretical_loss": 4.567482049090019, + "tokens_seen": 138215424 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048389167502507524, + "loss": 3.6086, + "theoretical_loss": 4.567195553086961, + "tokens_seen": 138280960 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838816449348044, + "loss": 3.6078, + "theoretical_loss": 4.566909230829729, + "tokens_seen": 138346496 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838716148445336, + "loss": 3.5953, + "theoretical_loss": 4.566623082130729, + "tokens_seen": 138412032 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838615847542628, + "loss": 3.5194, + "theoretical_loss": 4.566337106802651, + "tokens_seen": 138477568 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483851554663992, + "loss": 3.4984, + "theoretical_loss": 4.56605130465848, + "tokens_seen": 138543104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048384152457372115, + "loss": 3.6139, + "theoretical_loss": 4.565765675511487, + "tokens_seen": 138608640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838314944834504, + "loss": 3.4343, + "theoretical_loss": 4.565480219175237, + "tokens_seen": 138674176 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004838214643931795, + "loss": 3.6232, + "theoretical_loss": 4.56519493546358, + "tokens_seen": 138739712 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048381143430290875, + "loss": 3.4284, + "theoretical_loss": 4.56490982419066, + "tokens_seen": 138805248 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048380140421263793, + "loss": 3.5257, + "theoretical_loss": 4.564624885170902, + "tokens_seen": 138870784 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837913741223671, + "loss": 3.5903, + "theoretical_loss": 4.564340118219022, + "tokens_seen": 138936320 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837813440320963, + "loss": 3.3378, + "theoretical_loss": 4.56405552315002, + "tokens_seen": 139001856 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048377131394182547, + "loss": 3.7357, + "theoretical_loss": 4.563771099779187, + "tokens_seen": 139067392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048376128385155465, + "loss": 3.6558, + "theoretical_loss": 4.563486847922093, + "tokens_seen": 139132928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837512537612839, + "loss": 3.6479, + "theoretical_loss": 4.563202767394597, + "tokens_seen": 139198464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 252191, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.8185887336730957, + "objective/train/theoretical_loss": 4.562918858012843, + "objective/train/tokens_used": 159724000, + "theoretical_loss": 4.562918858012843, + "tokens_seen": 139264000 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483741223671013, + "loss": 3.7075, + "theoretical_loss": 4.562918858012843, + "tokens_seen": 139264000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048373119358074225, + "loss": 3.6562, + "theoretical_loss": 4.562635119593255, + "tokens_seen": 139329536 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048372116349047143, + "loss": 3.3579, + "theoretical_loss": 4.562351551952542, + "tokens_seen": 139395072 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837111334002006, + "loss": 3.4886, + "theoretical_loss": 4.5620681549076965, + "tokens_seen": 139460608 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004837011033099298, + "loss": 3.5397, + "theoretical_loss": 4.561784928275992, + "tokens_seen": 139526144 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483691073219659, + "loss": 3.5268, + "theoretical_loss": 4.561501871874984, + "tokens_seen": 139591680 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048368104312938816, + "loss": 3.544, + "theoretical_loss": 4.561218985522507, + "tokens_seen": 139657216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836710130391174, + "loss": 3.4519, + "theoretical_loss": 4.560936269036679, + "tokens_seen": 139722752 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836609829488465, + "loss": 3.5463, + "theoretical_loss": 4.560653722235895, + "tokens_seen": 139788288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048365095285857575, + "loss": 3.6003, + "theoretical_loss": 4.560371344938831, + "tokens_seen": 139853824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836409227683049, + "loss": 3.634, + "theoretical_loss": 4.560089136964439, + "tokens_seen": 139919360 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836308926780341, + "loss": 3.6618, + "theoretical_loss": 4.559807098131953, + "tokens_seen": 139984896 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836208625877633, + "loss": 3.5859, + "theoretical_loss": 4.559525228260882, + "tokens_seen": 140050432 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836108324974925, + "loss": 3.5828, + "theoretical_loss": 4.559243527171011, + "tokens_seen": 140115968 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048360080240722166, + "loss": 3.5559, + "theoretical_loss": 4.558961994682403, + "tokens_seen": 140181504 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048359077231695084, + "loss": 3.6583, + "theoretical_loss": 4.558680630615397, + "tokens_seen": 140247040 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048358074222668, + "loss": 3.497, + "theoretical_loss": 4.558399434790607, + "tokens_seen": 140312576 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048357071213640926, + "loss": 3.6507, + "theoretical_loss": 4.558118407028921, + "tokens_seen": 140378112 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835606820461384, + "loss": 3.5798, + "theoretical_loss": 4.557837547151502, + "tokens_seen": 140443648 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835506519558676, + "loss": 3.7075, + "theoretical_loss": 4.557556854979786, + "tokens_seen": 140509184 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835406218655968, + "loss": 3.4796, + "theoretical_loss": 4.5572763303354815, + "tokens_seen": 140574720 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483530591775326, + "loss": 3.516, + "theoretical_loss": 4.556995973040574, + "tokens_seen": 140640256 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048352056168505516, + "loss": 3.45, + "theoretical_loss": 4.556715782917314, + "tokens_seen": 140705792 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048351053159478434, + "loss": 3.5895, + "theoretical_loss": 4.556435759788229, + "tokens_seen": 140771328 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004835005015045135, + "loss": 3.5183, + "theoretical_loss": 4.556155903476114, + "tokens_seen": 140836864 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 255032, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3441946506500244, + "objective/train/theoretical_loss": 4.555876213804037, + "objective/train/tokens_used": 161362400, + "theoretical_loss": 4.555876213804037, + "tokens_seen": 140902400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048349047141424276, + "loss": 3.5251, + "theoretical_loss": 4.555876213804037, + "tokens_seen": 140902400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048348044132397194, + "loss": 3.5062, + "theoretical_loss": 4.555596690595333, + "tokens_seen": 140967936 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834704112337011, + "loss": 3.6459, + "theoretical_loss": 4.555317333673611, + "tokens_seen": 141033472 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834603811434303, + "loss": 3.5418, + "theoretical_loss": 4.555038142862742, + "tokens_seen": 141099008 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834503510531595, + "loss": 3.6024, + "theoretical_loss": 4.5547591179868725, + "tokens_seen": 141164544 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834403209628887, + "loss": 3.636, + "theoretical_loss": 4.554480258870409, + "tokens_seen": 141230080 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048343029087261785, + "loss": 3.5015, + "theoretical_loss": 4.554201565338033, + "tokens_seen": 141295616 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004834202607823471, + "loss": 3.3699, + "theoretical_loss": 4.5539230372146875, + "tokens_seen": 141361152 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048341023069207626, + "loss": 3.4566, + "theoretical_loss": 4.553644674325584, + "tokens_seen": 141426688 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048340020060180544, + "loss": 3.5625, + "theoretical_loss": 4.553366476496198, + "tokens_seen": 141492224 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833901705115346, + "loss": 3.5699, + "theoretical_loss": 4.553088443552269, + "tokens_seen": 141557760 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833801404212638, + "loss": 3.5521, + "theoretical_loss": 4.552810575319806, + "tokens_seen": 141623296 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483370110330993, + "loss": 3.6214, + "theoretical_loss": 4.552532871625077, + "tokens_seen": 141688832 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833600802407222, + "loss": 3.3883, + "theoretical_loss": 4.5522553322946155, + "tokens_seen": 141754368 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048335005015045135, + "loss": 3.5569, + "theoretical_loss": 4.551977957155217, + "tokens_seen": 141819904 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833400200601806, + "loss": 3.5735, + "theoretical_loss": 4.5517007460339425, + "tokens_seen": 141885440 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833299899699097, + "loss": 3.7778, + "theoretical_loss": 4.551423698758111, + "tokens_seen": 141950976 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048331995987963895, + "loss": 3.6275, + "theoretical_loss": 4.551146815155304, + "tokens_seen": 142016512 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048330992978936813, + "loss": 3.5044, + "theoretical_loss": 4.550870095053366, + "tokens_seen": 142082048 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832998996990973, + "loss": 3.5791, + "theoretical_loss": 4.550593538280398, + "tokens_seen": 142147584 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832898696088265, + "loss": 3.6546, + "theoretical_loss": 4.550317144664766, + "tokens_seen": 142213120 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048327983951855567, + "loss": 3.5328, + "theoretical_loss": 4.55004091403509, + "tokens_seen": 142278656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048326980942828485, + "loss": 3.4194, + "theoretical_loss": 4.5497648462202545, + "tokens_seen": 142344192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832597793380141, + "loss": 3.4379, + "theoretical_loss": 4.549488941049397, + "tokens_seen": 142409728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832497492477432, + "loss": 3.5957, + "theoretical_loss": 4.549213198351914, + "tokens_seen": 142475264 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 257777, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7964699268341064, + "objective/train/theoretical_loss": 4.548937617957463, + "objective/train/tokens_used": 163000800, + "theoretical_loss": 4.548937617957463, + "tokens_seen": 142540800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048323971915747245, + "loss": 3.5548, + "theoretical_loss": 4.548937617957463, + "tokens_seen": 142540800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048322968906720163, + "loss": 3.5506, + "theoretical_loss": 4.548662199695954, + "tokens_seen": 142606336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004832196589769308, + "loss": 3.6283, + "theoretical_loss": 4.548386943397556, + "tokens_seen": 142671872 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048320962888666, + "loss": 3.5598, + "theoretical_loss": 4.548111848892693, + "tokens_seen": 142737408 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831995987963892, + "loss": 3.6459, + "theoretical_loss": 4.547836916012042, + "tokens_seen": 142802944 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048318956870611836, + "loss": 3.4983, + "theoretical_loss": 4.547562144586539, + "tokens_seen": 142868480 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831795386158476, + "loss": 3.7062, + "theoretical_loss": 4.547287534447372, + "tokens_seen": 142934016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831695085255767, + "loss": 3.491, + "theoretical_loss": 4.5470130854259825, + "tokens_seen": 142999552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048315947843530595, + "loss": 3.7054, + "theoretical_loss": 4.546738797354065, + "tokens_seen": 143065088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831494483450351, + "loss": 3.516, + "theoretical_loss": 4.546464670063569, + "tokens_seen": 143130624 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831394182547643, + "loss": 3.5269, + "theoretical_loss": 4.546190703386695, + "tokens_seen": 143196160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831293881644935, + "loss": 3.5203, + "theoretical_loss": 4.545916897155894, + "tokens_seen": 143261696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004831193580742227, + "loss": 3.5303, + "theoretical_loss": 4.54564325120387, + "tokens_seen": 143327232 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048310932798395186, + "loss": 3.5105, + "theoretical_loss": 4.545369765363578, + "tokens_seen": 143392768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048309929789368104, + "loss": 3.5217, + "theoretical_loss": 4.545096439468223, + "tokens_seen": 143458304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830892678034102, + "loss": 3.628, + "theoretical_loss": 4.544823273351257, + "tokens_seen": 143523840 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048307923771313946, + "loss": 3.5657, + "theoretical_loss": 4.544550266846388, + "tokens_seen": 143589376 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830692076228686, + "loss": 3.4634, + "theoretical_loss": 4.544277419787566, + "tokens_seen": 143654912 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830591775325978, + "loss": 3.3823, + "theoretical_loss": 4.544004732008993, + "tokens_seen": 143720448 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483049147442327, + "loss": 3.5854, + "theoretical_loss": 4.543732203345119, + "tokens_seen": 143785984 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830391173520562, + "loss": 3.4976, + "theoretical_loss": 4.543459833630639, + "tokens_seen": 143851520 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048302908726178536, + "loss": 3.4834, + "theoretical_loss": 4.543187622700497, + "tokens_seen": 143917056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048301905717151454, + "loss": 3.6058, + "theoretical_loss": 4.542915570389884, + "tokens_seen": 143982592 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004830090270812437, + "loss": 3.4514, + "theoretical_loss": 4.542643676534234, + "tokens_seen": 144048128 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048299899699097296, + "loss": 3.5528, + "theoretical_loss": 4.542371940969231, + "tokens_seen": 144113664 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 259161, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7806003093719482, + "objective/train/theoretical_loss": 4.542100363530799, + "objective/train/tokens_used": 164639200, + "theoretical_loss": 4.542100363530799, + "tokens_seen": 144179200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829889669007021, + "loss": 3.5574, + "theoretical_loss": 4.542100363530799, + "tokens_seen": 144179200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829789368104313, + "loss": 3.5528, + "theoretical_loss": 4.54182894405511, + "tokens_seen": 144244736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048296890672016045, + "loss": 3.5323, + "theoretical_loss": 4.5415576823785795, + "tokens_seen": 144310272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829588766298897, + "loss": 3.6577, + "theoretical_loss": 4.541286578337866, + "tokens_seen": 144375808 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048294884653961887, + "loss": 3.6006, + "theoretical_loss": 4.541015631769872, + "tokens_seen": 144441344 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048293881644934805, + "loss": 3.5063, + "theoretical_loss": 4.5407448425117405, + "tokens_seen": 144506880 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048292878635907723, + "loss": 3.4028, + "theoretical_loss": 4.540474210400859, + "tokens_seen": 144572416 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048291875626880646, + "loss": 3.6684, + "theoretical_loss": 4.540203735274855, + "tokens_seen": 144637952 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829087261785356, + "loss": 3.4132, + "theoretical_loss": 4.5399334169716, + "tokens_seen": 144703488 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828986960882648, + "loss": 3.5536, + "theoretical_loss": 4.539663255329202, + "tokens_seen": 144769024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048288866599799395, + "loss": 3.3662, + "theoretical_loss": 4.539393250186015, + "tokens_seen": 144834560 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828786359077232, + "loss": 3.4983, + "theoretical_loss": 4.539123401380625, + "tokens_seen": 144900096 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048286860581745237, + "loss": 3.5839, + "theoretical_loss": 4.538853708751866, + "tokens_seen": 144965632 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048285857572718155, + "loss": 3.4721, + "theoretical_loss": 4.538584172138804, + "tokens_seen": 145031168 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048284854563691073, + "loss": 3.4941, + "theoretical_loss": 4.538314791380748, + "tokens_seen": 145096704 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828385155466399, + "loss": 3.5129, + "theoretical_loss": 4.538045566317242, + "tokens_seen": 145162240 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828284854563691, + "loss": 3.4246, + "theoretical_loss": 4.537776496788071, + "tokens_seen": 145227776 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048281845536609833, + "loss": 3.5131, + "theoretical_loss": 4.537507582633253, + "tokens_seen": 145293312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048280842527582746, + "loss": 3.4435, + "theoretical_loss": 4.537238823693045, + "tokens_seen": 145358848 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827983951855567, + "loss": 3.5818, + "theoretical_loss": 4.536970219807939, + "tokens_seen": 145424384 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827883650952858, + "loss": 3.6428, + "theoretical_loss": 4.536701770818665, + "tokens_seen": 145489920 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048277833500501505, + "loss": 3.387, + "theoretical_loss": 4.536433476566185, + "tokens_seen": 145555456 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048276830491474423, + "loss": 3.3432, + "theoretical_loss": 4.536165336891699, + "tokens_seen": 145620992 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827582748244734, + "loss": 3.4649, + "theoretical_loss": 4.535897351636638, + "tokens_seen": 145686528 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827482447342026, + "loss": 3.3971, + "theoretical_loss": 4.53562952064267, + "tokens_seen": 145752064 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 261926, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4912607669830322, + "objective/train/theoretical_loss": 4.535361843751696, + "objective/train/tokens_used": 166277600, + "theoretical_loss": 4.535361843751696, + "tokens_seen": 145817600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048273821464393183, + "loss": 3.4381, + "theoretical_loss": 4.535361843751696, + "tokens_seen": 145817600 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482728184553661, + "loss": 3.4382, + "theoretical_loss": 4.535094320805847, + "tokens_seen": 145883136 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827181544633902, + "loss": 3.446, + "theoretical_loss": 4.534826951647489, + "tokens_seen": 145948672 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827081243731194, + "loss": 3.6561, + "theoretical_loss": 4.5345597361192205, + "tokens_seen": 146014208 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048269809428284856, + "loss": 3.422, + "theoretical_loss": 4.53429267406387, + "tokens_seen": 146079744 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826880641925778, + "loss": 3.5571, + "theoretical_loss": 4.5340257653244995, + "tokens_seen": 146145280 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826780341023069, + "loss": 3.4954, + "theoretical_loss": 4.5337590097444, + "tokens_seen": 146210816 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048266800401203615, + "loss": 3.6325, + "theoretical_loss": 4.533492407167093, + "tokens_seen": 146276352 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826579739217653, + "loss": 3.4401, + "theoretical_loss": 4.53322595743633, + "tokens_seen": 146341888 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826479438314945, + "loss": 3.3296, + "theoretical_loss": 4.5329596603960916, + "tokens_seen": 146407424 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826379137412237, + "loss": 3.514, + "theoretical_loss": 4.53269351589059, + "tokens_seen": 146472960 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004826278836509529, + "loss": 3.4025, + "theoretical_loss": 4.532427523764261, + "tokens_seen": 146538496 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048261785356068206, + "loss": 3.5781, + "theoretical_loss": 4.532161683861773, + "tokens_seen": 146604032 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048260782347041124, + "loss": 3.398, + "theoretical_loss": 4.5318959960280205, + "tokens_seen": 146669568 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825977933801404, + "loss": 3.4666, + "theoretical_loss": 4.531630460108125, + "tokens_seen": 146735104 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048258776328986966, + "loss": 3.5678, + "theoretical_loss": 4.531365075947434, + "tokens_seen": 146800640 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825777331995988, + "loss": 3.6149, + "theoretical_loss": 4.531099843391524, + "tokens_seen": 146866176 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482567703109328, + "loss": 3.5055, + "theoretical_loss": 4.5308347622861955, + "tokens_seen": 146931712 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825576730190572, + "loss": 3.4086, + "theoretical_loss": 4.5305698324774735, + "tokens_seen": 146997248 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825476429287864, + "loss": 3.6714, + "theoretical_loss": 4.530305053811611, + "tokens_seen": 147062784 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048253761283851556, + "loss": 3.5554, + "theoretical_loss": 4.530040426135084, + "tokens_seen": 147128320 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048252758274824474, + "loss": 3.5783, + "theoretical_loss": 4.529775949294593, + "tokens_seen": 147193856 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825175526579739, + "loss": 3.4797, + "theoretical_loss": 4.529511623137061, + "tokens_seen": 147259392 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048250752256770316, + "loss": 3.6243, + "theoretical_loss": 4.529247447509637, + "tokens_seen": 147324928 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824974924774323, + "loss": 3.5696, + "theoretical_loss": 4.528983422259691, + "tokens_seen": 147390464 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 264882, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.608053207397461, + "objective/train/theoretical_loss": 4.528719547234816, + "objective/train/tokens_used": 167916000, + "theoretical_loss": 4.528719547234816, + "tokens_seen": 147456000 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824874623871615, + "loss": 3.3995, + "theoretical_loss": 4.528719547234816, + "tokens_seen": 147456000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048247743229689065, + "loss": 3.4477, + "theoretical_loss": 4.528455822282828, + "tokens_seen": 147521536 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824674022066199, + "loss": 3.6145, + "theoretical_loss": 4.528192247251763, + "tokens_seen": 147587072 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048245737211634907, + "loss": 3.5882, + "theoretical_loss": 4.52792882198988, + "tokens_seen": 147652608 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048244734202607825, + "loss": 3.4512, + "theoretical_loss": 4.527665546345656, + "tokens_seen": 147718144 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048243731193580743, + "loss": 3.4127, + "theoretical_loss": 4.5274024201677925, + "tokens_seen": 147783680 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048242728184553666, + "loss": 3.5336, + "theoretical_loss": 4.527139443305209, + "tokens_seen": 147849216 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824172517552658, + "loss": 3.5606, + "theoretical_loss": 4.526876615607042, + "tokens_seen": 147914752 + }, + { + "epoch": 0.04, + "learning_rate": 0.000482407221664995, + "loss": 3.4403, + "theoretical_loss": 4.526613936922654, + "tokens_seen": 147980288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048239719157472415, + "loss": 3.4868, + "theoretical_loss": 4.526351407101618, + "tokens_seen": 148045824 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823871614844534, + "loss": 3.5503, + "theoretical_loss": 4.526089025993732, + "tokens_seen": 148111360 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048237713139418257, + "loss": 3.4642, + "theoretical_loss": 4.525826793449008, + "tokens_seen": 148176896 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048236710130391175, + "loss": 3.5045, + "theoretical_loss": 4.525564709317678, + "tokens_seen": 148242432 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048235707121364093, + "loss": 3.5817, + "theoretical_loss": 4.525302773450187, + "tokens_seen": 148307968 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823470411233701, + "loss": 3.4603, + "theoretical_loss": 4.525040985697203, + "tokens_seen": 148373504 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004823370110330993, + "loss": 3.665, + "theoretical_loss": 4.524779345909604, + "tokens_seen": 148439040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048232698094282853, + "loss": 3.4725, + "theoretical_loss": 4.524517853938489, + "tokens_seen": 148504576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048231695085255766, + "loss": 3.2674, + "theoretical_loss": 4.524256509635169, + "tokens_seen": 148570112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004823069207622869, + "loss": 3.5503, + "theoretical_loss": 4.523995312851174, + "tokens_seen": 148635648 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482296890672016, + "loss": 3.4631, + "theoretical_loss": 4.523734263438241, + "tokens_seen": 148701184 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048228686058174525, + "loss": 3.626, + "theoretical_loss": 4.52347336124833, + "tokens_seen": 148766720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048227683049147443, + "loss": 3.5029, + "theoretical_loss": 4.52321260613361, + "tokens_seen": 148832256 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822668004012036, + "loss": 3.5698, + "theoretical_loss": 4.522951997946466, + "tokens_seen": 148897792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822567703109328, + "loss": 3.4782, + "theoretical_loss": 4.522691536539492, + "tokens_seen": 148963328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048224674022066203, + "loss": 3.4374, + "theoretical_loss": 4.522431221765498, + "tokens_seen": 149028864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 267653, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5719988346099854, + "objective/train/theoretical_loss": 4.522171053477507, + "objective/train/tokens_used": 169554400, + "theoretical_loss": 4.522171053477507, + "tokens_seen": 149094400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048223671013039116, + "loss": 3.5976, + "theoretical_loss": 4.522171053477507, + "tokens_seen": 149094400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822266800401204, + "loss": 3.5702, + "theoretical_loss": 4.5219110315287505, + "tokens_seen": 149159936 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822166499498495, + "loss": 3.5773, + "theoretical_loss": 4.521651155772675, + "tokens_seen": 149225472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048220661985957876, + "loss": 3.531, + "theoretical_loss": 4.521391426062934, + "tokens_seen": 149291008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048219658976930794, + "loss": 3.6295, + "theoretical_loss": 4.521131842253396, + "tokens_seen": 149356544 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821865596790371, + "loss": 3.4085, + "theoretical_loss": 4.520872404198139, + "tokens_seen": 149422080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821765295887663, + "loss": 3.383, + "theoretical_loss": 4.520613111751445, + "tokens_seen": 149487616 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821664994984955, + "loss": 3.4309, + "theoretical_loss": 4.520353964767814, + "tokens_seen": 149553152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048215646940822466, + "loss": 3.5164, + "theoretical_loss": 4.5200949631019505, + "tokens_seen": 149618688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821464393179539, + "loss": 3.3566, + "theoretical_loss": 4.519836106608768, + "tokens_seen": 149684224 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482136409227683, + "loss": 3.4719, + "theoretical_loss": 4.519577395143388, + "tokens_seen": 149749760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048212637913741226, + "loss": 3.4971, + "theoretical_loss": 4.519318828561142, + "tokens_seen": 149815296 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821163490471414, + "loss": 3.504, + "theoretical_loss": 4.519060406717565, + "tokens_seen": 149880832 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004821063189568706, + "loss": 3.5674, + "theoretical_loss": 4.518802129468405, + "tokens_seen": 149946368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820962888665998, + "loss": 3.5695, + "theoretical_loss": 4.51854399666961, + "tokens_seen": 150011904 + }, + { + "epoch": 0.05, + "learning_rate": 0.000482086258776329, + "loss": 3.5163, + "theoretical_loss": 4.518286008177341, + "tokens_seen": 150077440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048207622868605816, + "loss": 3.3885, + "theoretical_loss": 4.51802816384796, + "tokens_seen": 150142976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820661985957874, + "loss": 3.4805, + "theoretical_loss": 4.517770463538038, + "tokens_seen": 150208512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048205616850551653, + "loss": 3.5288, + "theoretical_loss": 4.517512907104347, + "tokens_seen": 150274048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048204613841524576, + "loss": 3.4368, + "theoretical_loss": 4.517255494403868, + "tokens_seen": 150339584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820361083249749, + "loss": 3.5381, + "theoretical_loss": 4.516998225293785, + "tokens_seen": 150405120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820260782347041, + "loss": 3.474, + "theoretical_loss": 4.516741099631485, + "tokens_seen": 150470656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820160481444333, + "loss": 3.565, + "theoretical_loss": 4.51648411727456, + "tokens_seen": 150536192 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820060180541625, + "loss": 3.3583, + "theoretical_loss": 4.5162272780808035, + "tokens_seen": 150601728 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048199598796389167, + "loss": 3.4793, + "theoretical_loss": 4.515970581908216, + "tokens_seen": 150667264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 270469, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4180386066436768, + "objective/train/theoretical_loss": 4.515714028614996, + "objective/train/tokens_used": 171192800, + "theoretical_loss": 4.515714028614996, + "tokens_seen": 150732800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048198595787362085, + "loss": 3.4769, + "theoretical_loss": 4.515714028614996, + "tokens_seen": 150732800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004819759277833501, + "loss": 3.3997, + "theoretical_loss": 4.515457618059546, + "tokens_seen": 150798336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048196589769307927, + "loss": 3.6109, + "theoretical_loss": 4.515201350100471, + "tokens_seen": 150863872 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048195586760280845, + "loss": 3.4822, + "theoretical_loss": 4.514945224596577, + "tokens_seen": 150929408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048194583751253763, + "loss": 3.5377, + "theoretical_loss": 4.5146892414068684, + "tokens_seen": 150994944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048193580742226686, + "loss": 3.4778, + "theoretical_loss": 4.514433400390554, + "tokens_seen": 151060480 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481925777331996, + "loss": 3.3808, + "theoretical_loss": 4.514177701407042, + "tokens_seen": 151126016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004819157472417252, + "loss": 3.4892, + "theoretical_loss": 4.51392214431594, + "tokens_seen": 151191552 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048190571715145435, + "loss": 3.6156, + "theoretical_loss": 4.513666728977054, + "tokens_seen": 151257088 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818956870611836, + "loss": 3.5852, + "theoretical_loss": 4.51341145525039, + "tokens_seen": 151322624 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048188565697091277, + "loss": 3.2921, + "theoretical_loss": 4.513156322996155, + "tokens_seen": 151388160 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048187562688064195, + "loss": 3.5636, + "theoretical_loss": 4.512901332074751, + "tokens_seen": 151453696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048186559679037113, + "loss": 3.4103, + "theoretical_loss": 4.5126464823467805, + "tokens_seen": 151519232 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818555667001003, + "loss": 3.2773, + "theoretical_loss": 4.512391773673042, + "tokens_seen": 151584768 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818455366098295, + "loss": 3.3903, + "theoretical_loss": 4.5121372059145335, + "tokens_seen": 151650304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048183550651955873, + "loss": 3.5801, + "theoretical_loss": 4.511882778932447, + "tokens_seen": 151715840 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048182547642928786, + "loss": 3.3816, + "theoretical_loss": 4.511628492588174, + "tokens_seen": 151781376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818154463390171, + "loss": 3.5457, + "theoretical_loss": 4.5113743467433, + "tokens_seen": 151846912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004818054162487462, + "loss": 3.4334, + "theoretical_loss": 4.511120341259608, + "tokens_seen": 151912448 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048179538615847545, + "loss": 3.4159, + "theoretical_loss": 4.510866475999077, + "tokens_seen": 151977984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048178535606820463, + "loss": 3.4603, + "theoretical_loss": 4.510612750823878, + "tokens_seen": 152043520 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817753259779338, + "loss": 3.5038, + "theoretical_loss": 4.5103591655963795, + "tokens_seen": 152109056 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481765295887663, + "loss": 3.5486, + "theoretical_loss": 4.510105720179144, + "tokens_seen": 152174592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048175526579739223, + "loss": 3.4646, + "theoretical_loss": 4.5098524144349295, + "tokens_seen": 152240128 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048174523570712136, + "loss": 3.5994, + "theoretical_loss": 4.509599248226683, + "tokens_seen": 152305664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 273356, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5088133811950684, + "objective/train/theoretical_loss": 4.509346221417552, + "objective/train/tokens_used": 172831200, + "theoretical_loss": 4.509346221417552, + "tokens_seen": 152371200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817352056168506, + "loss": 3.522, + "theoretical_loss": 4.509346221417552, + "tokens_seen": 152371200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004817251755265797, + "loss": 3.3607, + "theoretical_loss": 4.509093333870869, + "tokens_seen": 152436736 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048171514543630896, + "loss": 3.5121, + "theoretical_loss": 4.508840585450166, + "tokens_seen": 152502272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048170511534603814, + "loss": 3.4916, + "theoretical_loss": 4.508587976019164, + "tokens_seen": 152567808 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816950852557673, + "loss": 3.4637, + "theoretical_loss": 4.508335505441774, + "tokens_seen": 152633344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816850551654965, + "loss": 3.5384, + "theoretical_loss": 4.508083173582105, + "tokens_seen": 152698880 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816750250752257, + "loss": 3.4652, + "theoretical_loss": 4.507830980304451, + "tokens_seen": 152764416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048166499498495486, + "loss": 3.4346, + "theoretical_loss": 4.5075789254733, + "tokens_seen": 152829952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816549648946841, + "loss": 3.5889, + "theoretical_loss": 4.507327008953329, + "tokens_seen": 152895488 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816449348044132, + "loss": 3.3827, + "theoretical_loss": 4.507075230609407, + "tokens_seen": 152961024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048163490471414246, + "loss": 3.4468, + "theoretical_loss": 4.506823590306591, + "tokens_seen": 153026560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816248746238716, + "loss": 3.4763, + "theoretical_loss": 4.506572087910127, + "tokens_seen": 153092096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816148445336008, + "loss": 3.4078, + "theoretical_loss": 4.506320723285455, + "tokens_seen": 153157632 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048160481444333, + "loss": 3.6201, + "theoretical_loss": 4.506069496298198, + "tokens_seen": 153223168 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815947843530592, + "loss": 3.4768, + "theoretical_loss": 4.5058184068141705, + "tokens_seen": 153288704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048158475426278837, + "loss": 3.4891, + "theoretical_loss": 4.505567454699373, + "tokens_seen": 153354240 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815747241725176, + "loss": 3.5196, + "theoretical_loss": 4.505316639819997, + "tokens_seen": 153419776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048156469408224673, + "loss": 3.5861, + "theoretical_loss": 4.505065962042418, + "tokens_seen": 153485312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048155466399197596, + "loss": 3.3749, + "theoretical_loss": 4.504815421233202, + "tokens_seen": 153550848 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815446339017051, + "loss": 3.5018, + "theoretical_loss": 4.504565017259097, + "tokens_seen": 153616384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815346038114343, + "loss": 3.5013, + "theoretical_loss": 4.504314749987044, + "tokens_seen": 153681920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815245737211635, + "loss": 3.606, + "theoretical_loss": 4.504064619284163, + "tokens_seen": 153747456 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004815145436308927, + "loss": 3.4822, + "theoretical_loss": 4.503814625017766, + "tokens_seen": 153812992 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048150451354062187, + "loss": 3.5306, + "theoretical_loss": 4.5035647670553445, + "tokens_seen": 153878528 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048149448345035105, + "loss": 3.6028, + "theoretical_loss": 4.503315045264581, + "tokens_seen": 153944064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 275667, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.871817111968994, + "objective/train/theoretical_loss": 4.503065459513339, + "objective/train/tokens_used": 174469600, + "theoretical_loss": 4.503065459513339, + "tokens_seen": 154009600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048148445336008023, + "loss": 3.5554, + "theoretical_loss": 4.503065459513339, + "tokens_seen": 154009600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048147442326980947, + "loss": 3.5947, + "theoretical_loss": 4.502816009669665, + "tokens_seen": 154075136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004814643931795386, + "loss": 3.5241, + "theoretical_loss": 4.502566695601795, + "tokens_seen": 154140672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048145436308926783, + "loss": 3.4878, + "theoretical_loss": 4.502317517178142, + "tokens_seen": 154206208 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048144433299899696, + "loss": 3.5252, + "theoretical_loss": 4.502068474267309, + "tokens_seen": 154271744 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004814343029087262, + "loss": 3.7918, + "theoretical_loss": 4.501819566738076, + "tokens_seen": 154337280 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048142427281845537, + "loss": 3.5565, + "theoretical_loss": 4.501570794459411, + "tokens_seen": 154402816 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048141424272818455, + "loss": 3.4284, + "theoretical_loss": 4.501322157300461, + "tokens_seen": 154468352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048140421263791373, + "loss": 3.3857, + "theoretical_loss": 4.501073655130554, + "tokens_seen": 154533888 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048139418254764297, + "loss": 3.4512, + "theoretical_loss": 4.500825287819205, + "tokens_seen": 154599424 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813841524573721, + "loss": 3.4252, + "theoretical_loss": 4.500577055236104, + "tokens_seen": 154664960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048137412236710133, + "loss": 3.4851, + "theoretical_loss": 4.500328957251128, + "tokens_seen": 154730496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048136409227683046, + "loss": 3.4703, + "theoretical_loss": 4.500080993734329, + "tokens_seen": 154796032 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813540621865597, + "loss": 3.4669, + "theoretical_loss": 4.499833164555944, + "tokens_seen": 154861568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813440320962889, + "loss": 3.5337, + "theoretical_loss": 4.499585469586387, + "tokens_seen": 154927104 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048133400200601806, + "loss": 3.4233, + "theoretical_loss": 4.499337908696255, + "tokens_seen": 154992640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048132397191574724, + "loss": 3.542, + "theoretical_loss": 4.499090481756321, + "tokens_seen": 155058176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813139418254764, + "loss": 3.4362, + "theoretical_loss": 4.498843188637538, + "tokens_seen": 155123712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813039117352056, + "loss": 3.4034, + "theoretical_loss": 4.498596029211041, + "tokens_seen": 155189248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048129388164493483, + "loss": 3.4981, + "theoretical_loss": 4.498349003348137, + "tokens_seen": 155254784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048128385155466396, + "loss": 3.5092, + "theoretical_loss": 4.4981021109203185, + "tokens_seen": 155320320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812738214643932, + "loss": 3.4757, + "theoretical_loss": 4.49785535179925, + "tokens_seen": 155385856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812637913741223, + "loss": 3.5368, + "theoretical_loss": 4.497608725856776, + "tokens_seen": 155451392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048125376128385156, + "loss": 3.6676, + "theoretical_loss": 4.497362232964919, + "tokens_seen": 155516928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048124373119358074, + "loss": 3.7426, + "theoretical_loss": 4.497115872995876, + "tokens_seen": 155582464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 278643, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4784624576568604, + "objective/train/theoretical_loss": 4.496869645822022, + "objective/train/tokens_used": 176108000, + "theoretical_loss": 4.496869645822022, + "tokens_seen": 155648000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812337011033099, + "loss": 3.425, + "theoretical_loss": 4.496869645822022, + "tokens_seen": 155648000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048122367101303916, + "loss": 3.5237, + "theoretical_loss": 4.496623551315908, + "tokens_seen": 155713536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048121364092276834, + "loss": 3.5431, + "theoretical_loss": 4.496377589350261, + "tokens_seen": 155779072 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812036108324975, + "loss": 3.5275, + "theoretical_loss": 4.496131759797984, + "tokens_seen": 155844608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811935807422267, + "loss": 3.5453, + "theoretical_loss": 4.495886062532153, + "tokens_seen": 155910144 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811835506519559, + "loss": 3.5227, + "theoretical_loss": 4.495640497426023, + "tokens_seen": 155975680 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048117352056168506, + "loss": 3.3562, + "theoretical_loss": 4.495395064353019, + "tokens_seen": 156041216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811634904714143, + "loss": 3.444, + "theoretical_loss": 4.4951497631867445, + "tokens_seen": 156106752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811534603811434, + "loss": 3.6191, + "theoretical_loss": 4.494904593800973, + "tokens_seen": 156172288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048114343029087266, + "loss": 3.3627, + "theoretical_loss": 4.4946595560696565, + "tokens_seen": 156237824 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811334002006018, + "loss": 3.6634, + "theoretical_loss": 4.494414649866915, + "tokens_seen": 156303360 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481123370110331, + "loss": 3.3659, + "theoretical_loss": 4.494169875067046, + "tokens_seen": 156368896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811133400200602, + "loss": 3.4727, + "theoretical_loss": 4.493925231544516, + "tokens_seen": 156434432 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004811033099297894, + "loss": 3.5924, + "theoretical_loss": 4.493680719173968, + "tokens_seen": 156499968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048109327983951857, + "loss": 3.6006, + "theoretical_loss": 4.4934363378302145, + "tokens_seen": 156565504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810832497492478, + "loss": 3.4333, + "theoretical_loss": 4.493192087388239, + "tokens_seen": 156631040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048107321965897693, + "loss": 3.3603, + "theoretical_loss": 4.4929479677232, + "tokens_seen": 156696576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048106318956870616, + "loss": 3.5772, + "theoretical_loss": 4.4927039787104235, + "tokens_seen": 156762112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810531594784353, + "loss": 3.5418, + "theoretical_loss": 4.4924601202254095, + "tokens_seen": 156827648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810431293881645, + "loss": 3.5168, + "theoretical_loss": 4.492216392143826, + "tokens_seen": 156893184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810330992978937, + "loss": 3.5933, + "theoretical_loss": 4.491972794341514, + "tokens_seen": 156958720 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810230692076229, + "loss": 3.5078, + "theoretical_loss": 4.49172932669448, + "tokens_seen": 157024256 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048101303911735207, + "loss": 3.5933, + "theoretical_loss": 4.491485989078906, + "tokens_seen": 157089792 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048100300902708125, + "loss": 3.5058, + "theoretical_loss": 4.491242781371138, + "tokens_seen": 157155328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048099297893681043, + "loss": 3.6491, + "theoretical_loss": 4.490999703447697, + "tokens_seen": 157220864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 281475, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.511974334716797, + "objective/train/theoretical_loss": 4.4907567551852665, + "objective/train/tokens_used": 177746400, + "theoretical_loss": 4.4907567551852665, + "tokens_seen": 157286400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048098294884653967, + "loss": 3.5928, + "theoretical_loss": 4.4907567551852665, + "tokens_seen": 157286400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004809729187562688, + "loss": 3.5285, + "theoretical_loss": 4.490513936460702, + "tokens_seen": 157351936 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048096288866599803, + "loss": 3.5624, + "theoretical_loss": 4.490271247151027, + "tokens_seen": 157417472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048095285857572716, + "loss": 3.6584, + "theoretical_loss": 4.490028687133432, + "tokens_seen": 157483008 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004809428284854564, + "loss": 3.6339, + "theoretical_loss": 4.489786256285276, + "tokens_seen": 157548544 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048093279839518557, + "loss": 3.5268, + "theoretical_loss": 4.489543954484084, + "tokens_seen": 157614080 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048092276830491475, + "loss": 3.4821, + "theoretical_loss": 4.489301781607551, + "tokens_seen": 157679616 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048091273821464393, + "loss": 3.572, + "theoretical_loss": 4.489059737533534, + "tokens_seen": 157745152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048090270812437317, + "loss": 3.615, + "theoretical_loss": 4.48881782214006, + "tokens_seen": 157810688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808926780341023, + "loss": 3.6806, + "theoretical_loss": 4.48857603530532, + "tokens_seen": 157876224 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048088264794383153, + "loss": 3.6171, + "theoretical_loss": 4.488334376907673, + "tokens_seen": 157941760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048087261785356066, + "loss": 3.5999, + "theoretical_loss": 4.4880928468256425, + "tokens_seen": 158007296 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808625877632899, + "loss": 3.4983, + "theoretical_loss": 4.487851444937916, + "tokens_seen": 158072832 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808525576730191, + "loss": 3.4139, + "theoretical_loss": 4.487610171123347, + "tokens_seen": 158138368 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048084252758274826, + "loss": 3.3477, + "theoretical_loss": 4.487369025260954, + "tokens_seen": 158203904 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048083249749247744, + "loss": 3.5383, + "theoretical_loss": 4.48712800722992, + "tokens_seen": 158269440 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808224674022066, + "loss": 3.3682, + "theoretical_loss": 4.48688711690959, + "tokens_seen": 158334976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004808124373119358, + "loss": 3.3346, + "theoretical_loss": 4.486646354179475, + "tokens_seen": 158400512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048080240722166503, + "loss": 3.5343, + "theoretical_loss": 4.48640571891925, + "tokens_seen": 158466048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048079237713139416, + "loss": 3.3801, + "theoretical_loss": 4.48616521100875, + "tokens_seen": 158531584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807823470411234, + "loss": 3.413, + "theoretical_loss": 4.485924830327974, + "tokens_seen": 158597120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807723169508525, + "loss": 3.5698, + "theoretical_loss": 4.485684576757087, + "tokens_seen": 158662656 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048076228686058176, + "loss": 3.4742, + "theoretical_loss": 4.485444450176413, + "tokens_seen": 158728192 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048075225677031094, + "loss": 3.6036, + "theoretical_loss": 4.485204450466437, + "tokens_seen": 158793728 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807422266800401, + "loss": 3.6036, + "theoretical_loss": 4.484964577507808, + "tokens_seen": 158859264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 282978, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.911756753921509, + "objective/train/theoretical_loss": 4.484724831181337, + "objective/train/tokens_used": 179384800, + "theoretical_loss": 4.484724831181337, + "tokens_seen": 158924800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807321965897693, + "loss": 3.5403, + "theoretical_loss": 4.484724831181337, + "tokens_seen": 158924800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048072216649949854, + "loss": 3.335, + "theoretical_loss": 4.4844852113679945, + "tokens_seen": 158990336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048071213640922766, + "loss": 3.2906, + "theoretical_loss": 4.484245717948913, + "tokens_seen": 159055872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004807021063189569, + "loss": 3.5695, + "theoretical_loss": 4.484006350805385, + "tokens_seen": 159121408 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480692076228686, + "loss": 3.5224, + "theoretical_loss": 4.483767109818862, + "tokens_seen": 159186944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048068204613841526, + "loss": 3.349, + "theoretical_loss": 4.483527994870958, + "tokens_seen": 159252480 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048067201604814444, + "loss": 3.5804, + "theoretical_loss": 4.483289005843445, + "tokens_seen": 159318016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806619859578736, + "loss": 3.3795, + "theoretical_loss": 4.483050142618255, + "tokens_seen": 159383552 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806519558676028, + "loss": 3.604, + "theoretical_loss": 4.482811405077482, + "tokens_seen": 159449088 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480641925777332, + "loss": 3.5931, + "theoretical_loss": 4.482572793103373, + "tokens_seen": 159514624 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048063189568706117, + "loss": 3.6163, + "theoretical_loss": 4.482334306578339, + "tokens_seen": 159580160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806218655967904, + "loss": 3.4524, + "theoretical_loss": 4.482095945384946, + "tokens_seen": 159645696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048061183550651953, + "loss": 3.6338, + "theoretical_loss": 4.481857709405919, + "tokens_seen": 159711232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048060180541624877, + "loss": 3.6408, + "theoretical_loss": 4.4816195985241425, + "tokens_seen": 159776768 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048059177532597795, + "loss": 3.6232, + "theoretical_loss": 4.481381612622657, + "tokens_seen": 159842304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048058174523570713, + "loss": 3.5465, + "theoretical_loss": 4.481143751584659, + "tokens_seen": 159907840 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805717151454363, + "loss": 3.4289, + "theoretical_loss": 4.480906015293505, + "tokens_seen": 159973376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805616850551655, + "loss": 3.4922, + "theoretical_loss": 4.480668403632706, + "tokens_seen": 160038912 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048055165496489467, + "loss": 3.4513, + "theoretical_loss": 4.480430916485929, + "tokens_seen": 160104448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805416248746239, + "loss": 3.5782, + "theoretical_loss": 4.480193553736999, + "tokens_seen": 160169984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048053159478435303, + "loss": 3.43, + "theoretical_loss": 4.479956315269897, + "tokens_seen": 160235520 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048052156469408227, + "loss": 3.3231, + "theoretical_loss": 4.479719200968757, + "tokens_seen": 160301056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004805115346038114, + "loss": 3.5921, + "theoretical_loss": 4.479482210717871, + "tokens_seen": 160366592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048050150451354063, + "loss": 3.5253, + "theoretical_loss": 4.479245344401685, + "tokens_seen": 160432128 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804914744232698, + "loss": 3.3623, + "theoretical_loss": 4.479008601904798, + "tokens_seen": 160497664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 286119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5894904136657715, + "objective/train/theoretical_loss": 4.478771983111967, + "objective/train/tokens_used": 181023200, + "theoretical_loss": 4.478771983111967, + "tokens_seen": 160563200 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480481444332999, + "loss": 3.639, + "theoretical_loss": 4.478771983111967, + "tokens_seen": 160563200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048047141424272823, + "loss": 3.4376, + "theoretical_loss": 4.478535487908101, + "tokens_seen": 160628736 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048046138415245736, + "loss": 3.5855, + "theoretical_loss": 4.478299116178265, + "tokens_seen": 160694272 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804513540621866, + "loss": 3.4847, + "theoretical_loss": 4.478062867807674, + "tokens_seen": 160759808 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048044132397191577, + "loss": 3.6083, + "theoretical_loss": 4.4778267426817, + "tokens_seen": 160825344 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048043129388164495, + "loss": 3.5165, + "theoretical_loss": 4.477590740685867, + "tokens_seen": 160890880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048042126379137413, + "loss": 3.5702, + "theoretical_loss": 4.47735486170585, + "tokens_seen": 160956416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048041123370110337, + "loss": 3.4674, + "theoretical_loss": 4.47711910562748, + "tokens_seen": 161021952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004804012036108325, + "loss": 3.4392, + "theoretical_loss": 4.4768834723367394, + "tokens_seen": 161087488 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048039117352056173, + "loss": 3.4705, + "theoretical_loss": 4.4766479617197605, + "tokens_seen": 161153024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048038114343029086, + "loss": 3.503, + "theoretical_loss": 4.476412573662829, + "tokens_seen": 161218560 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803711133400201, + "loss": 3.3453, + "theoretical_loss": 4.4761773080523835, + "tokens_seen": 161284096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803610832497493, + "loss": 3.582, + "theoretical_loss": 4.475942164775013, + "tokens_seen": 161349632 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048035105315947846, + "loss": 3.4554, + "theoretical_loss": 4.475707143717455, + "tokens_seen": 161415168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048034102306920764, + "loss": 3.6966, + "theoretical_loss": 4.475472244766601, + "tokens_seen": 161480704 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004803309929789368, + "loss": 3.5052, + "theoretical_loss": 4.475237467809492, + "tokens_seen": 161546240 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480320962888666, + "loss": 3.6952, + "theoretical_loss": 4.47500281273332, + "tokens_seen": 161611776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048031093279839523, + "loss": 3.3402, + "theoretical_loss": 4.474768279425424, + "tokens_seen": 161677312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048030090270812436, + "loss": 3.4925, + "theoretical_loss": 4.474533867773299, + "tokens_seen": 161742848 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802908726178536, + "loss": 3.5017, + "theoretical_loss": 4.474299577664581, + "tokens_seen": 161808384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802808425275827, + "loss": 3.5594, + "theoretical_loss": 4.474065408987063, + "tokens_seen": 161873920 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048027081243731196, + "loss": 3.4176, + "theoretical_loss": 4.473831361628682, + "tokens_seen": 161939456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048026078234704114, + "loss": 3.3287, + "theoretical_loss": 4.473597435477526, + "tokens_seen": 162004992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802507522567703, + "loss": 3.5862, + "theoretical_loss": 4.473363630421831, + "tokens_seen": 162070528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802407221664995, + "loss": 3.3547, + "theoretical_loss": 4.473129946349982, + "tokens_seen": 162136064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 288885, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.936223030090332, + "objective/train/theoretical_loss": 4.472896383150508, + "objective/train/tokens_used": 182661600, + "theoretical_loss": 4.472896383150508, + "tokens_seen": 162201600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048023069207622874, + "loss": 3.2761, + "theoretical_loss": 4.472896383150508, + "tokens_seen": 162201600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048022066198595786, + "loss": 3.5978, + "theoretical_loss": 4.472662940712091, + "tokens_seen": 162267136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802106318956871, + "loss": 3.574, + "theoretical_loss": 4.472429618923558, + "tokens_seen": 162332672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048020060180541623, + "loss": 3.4758, + "theoretical_loss": 4.472196417673883, + "tokens_seen": 162398208 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048019057171514546, + "loss": 3.4412, + "theoretical_loss": 4.471963336852187, + "tokens_seen": 162463744 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048018054162487464, + "loss": 3.4574, + "theoretical_loss": 4.471730376347738, + "tokens_seen": 162529280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801705115346038, + "loss": 3.4413, + "theoretical_loss": 4.4714975360499505, + "tokens_seen": 162594816 + }, + { + "epoch": 0.05, + "learning_rate": 0.000480160481444333, + "loss": 3.6197, + "theoretical_loss": 4.471264815848384, + "tokens_seen": 162660352 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801504513540622, + "loss": 3.5685, + "theoretical_loss": 4.471032215632746, + "tokens_seen": 162725888 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048014042126379137, + "loss": 3.5433, + "theoretical_loss": 4.470799735292889, + "tokens_seen": 162791424 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004801303911735206, + "loss": 3.5198, + "theoretical_loss": 4.470567374718808, + "tokens_seen": 162856960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048012036108324973, + "loss": 3.4672, + "theoretical_loss": 4.470335133800649, + "tokens_seen": 162922496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048011033099297897, + "loss": 3.6525, + "theoretical_loss": 4.470103012428696, + "tokens_seen": 162988032 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048010030090270815, + "loss": 3.5195, + "theoretical_loss": 4.469871010493383, + "tokens_seen": 163053568 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048009027081243733, + "loss": 3.5464, + "theoretical_loss": 4.469639127885287, + "tokens_seen": 163119104 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800802407221665, + "loss": 3.5581, + "theoretical_loss": 4.4694073644951295, + "tokens_seen": 163184640 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800702106318957, + "loss": 3.5691, + "theoretical_loss": 4.469175720213771, + "tokens_seen": 163250176 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048006018054162487, + "loss": 3.5146, + "theoretical_loss": 4.468944194932225, + "tokens_seen": 163315712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800501504513541, + "loss": 3.4056, + "theoretical_loss": 4.468712788541639, + "tokens_seen": 163381248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048004012036108323, + "loss": 3.6312, + "theoretical_loss": 4.46848150093331, + "tokens_seen": 163446784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048003009027081247, + "loss": 3.586, + "theoretical_loss": 4.468250331998676, + "tokens_seen": 163512320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800200601805416, + "loss": 3.5451, + "theoretical_loss": 4.468019281629316, + "tokens_seen": 163577856 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048001003009027083, + "loss": 3.6071, + "theoretical_loss": 4.467788349716955, + "tokens_seen": 163643392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048, + "loss": 3.5755, + "theoretical_loss": 4.467557536153457, + "tokens_seen": 163708928 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799899699097292, + "loss": 3.586, + "theoretical_loss": 4.467326840830829, + "tokens_seen": 163774464 + }, + { + "debugging/Self-BLEU-5": 0.6356076445329311, + "debugging/distinct-1-grams": 0.7422109121196317, + "debugging/distinct-2-grams": 0.9393524968390767, + "debugging/entropy-1-grams": 6.466535315477175, + "debugging/entropy-2-grams": 7.64213762665985, + "debugging/length": 544.448275862069, + "debugging/num_segments": 29, + "epoch": 0.05, + "objective/train/docs_used": 291020, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7288644313812256, + "objective/train/theoretical_loss": 4.467096263641219, + "objective/train/tokens_used": 184300000, + "theoretical_loss": 4.467096263641219, + "tokens_seen": 163840000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799799398194584, + "loss": 3.6014, + "theoretical_loss": 4.467096263641219, + "tokens_seen": 163840000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047996990972918756, + "loss": 3.478, + "theoretical_loss": 4.466865804476919, + "tokens_seen": 163905536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047995987963891674, + "loss": 3.4162, + "theoretical_loss": 4.466635463230359, + "tokens_seen": 163971072 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047994984954864597, + "loss": 3.4351, + "theoretical_loss": 4.466405239794113, + "tokens_seen": 164036608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799398194583751, + "loss": 3.5307, + "theoretical_loss": 4.466175134060894, + "tokens_seen": 164102144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047992978936810433, + "loss": 3.5838, + "theoretical_loss": 4.465945145923554, + "tokens_seen": 164167680 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799197592778335, + "loss": 3.5063, + "theoretical_loss": 4.4657152752750875, + "tokens_seen": 164233216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004799097291875627, + "loss": 3.525, + "theoretical_loss": 4.465485522008629, + "tokens_seen": 164298752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798996990972919, + "loss": 3.5264, + "theoretical_loss": 4.465255886017452, + "tokens_seen": 164364288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047988966900702106, + "loss": 3.4686, + "theoretical_loss": 4.465026367194971, + "tokens_seen": 164429824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047987963891675024, + "loss": 3.4728, + "theoretical_loss": 4.464796965434738, + "tokens_seen": 164495360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798696088264795, + "loss": 3.5737, + "theoretical_loss": 4.464567680630443, + "tokens_seen": 164560896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798595787362086, + "loss": 3.4914, + "theoretical_loss": 4.464338512675919, + "tokens_seen": 164626432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047984954864593784, + "loss": 3.4481, + "theoretical_loss": 4.464109461465133, + "tokens_seen": 164691968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047983951855566696, + "loss": 3.5343, + "theoretical_loss": 4.4638805268921935, + "tokens_seen": 164757504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798294884653962, + "loss": 3.3193, + "theoretical_loss": 4.463651708851346, + "tokens_seen": 164823040 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798194583751254, + "loss": 3.5712, + "theoretical_loss": 4.463423007236974, + "tokens_seen": 164888576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047980942828485456, + "loss": 3.5752, + "theoretical_loss": 4.4631944219436, + "tokens_seen": 164954112 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047979939819458374, + "loss": 3.5827, + "theoretical_loss": 4.462965952865879, + "tokens_seen": 165019648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797893681043129, + "loss": 3.543, + "theoretical_loss": 4.46273759989861, + "tokens_seen": 165085184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797793380140421, + "loss": 3.3147, + "theoretical_loss": 4.462509362936723, + "tokens_seen": 165150720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047976930792377134, + "loss": 3.3567, + "theoretical_loss": 4.46228124187529, + "tokens_seen": 165216256 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047975927783350047, + "loss": 3.6157, + "theoretical_loss": 4.462053236609516, + "tokens_seen": 165281792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797492477432297, + "loss": 3.4862, + "theoretical_loss": 4.461825347034742, + "tokens_seen": 165347328 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797392176529589, + "loss": 3.493, + "theoretical_loss": 4.461597573046449, + "tokens_seen": 165412864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 293925, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5984041690826416, + "objective/train/theoretical_loss": 4.461369914540247, + "objective/train/tokens_used": 185938400, + "theoretical_loss": 4.461369914540247, + "tokens_seen": 165478400 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047972918756268807, + "loss": 3.5903, + "theoretical_loss": 4.461369914540247, + "tokens_seen": 165478400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797191574724173, + "loss": 3.6286, + "theoretical_loss": 4.4611423714118885, + "tokens_seen": 165543936 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047970912738214643, + "loss": 3.5692, + "theoretical_loss": 4.460914943557256, + "tokens_seen": 165609472 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047969909729187566, + "loss": 3.5348, + "theoretical_loss": 4.460687630872371, + "tokens_seen": 165675008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047968906720160484, + "loss": 3.4274, + "theoretical_loss": 4.46046043325339, + "tokens_seen": 165740544 + }, + { + "epoch": 0.05, + "learning_rate": 0.000479679037111334, + "loss": 3.4097, + "theoretical_loss": 4.460233350596599, + "tokens_seen": 165806080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796690070210632, + "loss": 3.4328, + "theoretical_loss": 4.460006382798425, + "tokens_seen": 165871616 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796589769307924, + "loss": 3.5767, + "theoretical_loss": 4.459779529755423, + "tokens_seen": 165937152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047964894684052157, + "loss": 3.5894, + "theoretical_loss": 4.459552791364288, + "tokens_seen": 166002688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796389167502508, + "loss": 3.4712, + "theoretical_loss": 4.459326167521844, + "tokens_seen": 166068224 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047962888665997993, + "loss": 3.4685, + "theoretical_loss": 4.4590996581250515, + "tokens_seen": 166133760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047961885656970917, + "loss": 3.4005, + "theoretical_loss": 4.458873263071002, + "tokens_seen": 166199296 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047960882647943835, + "loss": 3.3411, + "theoretical_loss": 4.458646982256921, + "tokens_seen": 166264832 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047959879638916753, + "loss": 3.4968, + "theoretical_loss": 4.458420815580169, + "tokens_seen": 166330368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795887662988967, + "loss": 3.381, + "theoretical_loss": 4.458194762938234, + "tokens_seen": 166395904 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795787362086259, + "loss": 3.5317, + "theoretical_loss": 4.457968824228743, + "tokens_seen": 166461440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047956870611835507, + "loss": 3.3729, + "theoretical_loss": 4.457742999349449, + "tokens_seen": 166526976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795586760280843, + "loss": 3.5432, + "theoretical_loss": 4.4575172881982414, + "tokens_seen": 166592512 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047954864593781343, + "loss": 3.3331, + "theoretical_loss": 4.457291690673139, + "tokens_seen": 166658048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047953861584754267, + "loss": 3.5802, + "theoretical_loss": 4.457066206672291, + "tokens_seen": 166723584 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795285857572718, + "loss": 3.6307, + "theoretical_loss": 4.456840836093983, + "tokens_seen": 166789120 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047951855566700103, + "loss": 3.597, + "theoretical_loss": 4.456615578836625, + "tokens_seen": 166854656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004795085255767302, + "loss": 3.4548, + "theoretical_loss": 4.456390434798762, + "tokens_seen": 166920192 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794984954864594, + "loss": 3.3778, + "theoretical_loss": 4.45616540387907, + "tokens_seen": 166985728 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794884653961886, + "loss": 3.5651, + "theoretical_loss": 4.4559404859763525, + "tokens_seen": 167051264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 296587, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6113717555999756, + "objective/train/theoretical_loss": 4.455715680989545, + "objective/train/tokens_used": 187576800, + "theoretical_loss": 4.455715680989545, + "tokens_seen": 167116800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047947843530591776, + "loss": 3.5897, + "theoretical_loss": 4.455715680989545, + "tokens_seen": 167116800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047946840521564694, + "loss": 3.4152, + "theoretical_loss": 4.455490988817713, + "tokens_seen": 167182336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047945837512537617, + "loss": 3.3392, + "theoretical_loss": 4.4552664093600525, + "tokens_seen": 167247872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794483450351053, + "loss": 3.4531, + "theoretical_loss": 4.455041942515887, + "tokens_seen": 167313408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047943831494483453, + "loss": 3.5399, + "theoretical_loss": 4.454817588184669, + "tokens_seen": 167378944 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794282848545637, + "loss": 3.661, + "theoretical_loss": 4.454593346265984, + "tokens_seen": 167444480 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794182547642929, + "loss": 3.4407, + "theoretical_loss": 4.454369216659542, + "tokens_seen": 167510016 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004794082246740221, + "loss": 3.5553, + "theoretical_loss": 4.454145199265183, + "tokens_seen": 167575552 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047939819458375126, + "loss": 3.3857, + "theoretical_loss": 4.453921293982877, + "tokens_seen": 167641088 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047938816449348044, + "loss": 3.5667, + "theoretical_loss": 4.453697500712722, + "tokens_seen": 167706624 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793781344032097, + "loss": 3.386, + "theoretical_loss": 4.453473819354942, + "tokens_seen": 167772160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793681043129388, + "loss": 3.32, + "theoretical_loss": 4.453250249809889, + "tokens_seen": 167837696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047935807422266804, + "loss": 3.3859, + "theoretical_loss": 4.453026791978045, + "tokens_seen": 167903232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047934804413239716, + "loss": 3.5149, + "theoretical_loss": 4.4528034457600185, + "tokens_seen": 167968768 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793380140421264, + "loss": 3.5259, + "theoretical_loss": 4.452580211056542, + "tokens_seen": 168034304 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793279839518556, + "loss": 3.7234, + "theoretical_loss": 4.452357087768481, + "tokens_seen": 168099840 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047931795386158476, + "loss": 3.5276, + "theoretical_loss": 4.45213407579682, + "tokens_seen": 168165376 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047930792377131394, + "loss": 3.2304, + "theoretical_loss": 4.451911175042679, + "tokens_seen": 168230912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792978936810431, + "loss": 3.3448, + "theoretical_loss": 4.451688385407296, + "tokens_seen": 168296448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792878635907723, + "loss": 3.6091, + "theoretical_loss": 4.451465706792041, + "tokens_seen": 168361984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047927783350050154, + "loss": 3.7533, + "theoretical_loss": 4.4512431390984055, + "tokens_seen": 168427520 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047926780341023067, + "loss": 3.6563, + "theoretical_loss": 4.451020682228011, + "tokens_seen": 168493056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792577733199599, + "loss": 3.6414, + "theoretical_loss": 4.450798336082601, + "tokens_seen": 168558592 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792477432296891, + "loss": 3.5151, + "theoretical_loss": 4.450576100564046, + "tokens_seen": 168624128 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047923771313941827, + "loss": 3.4532, + "theoretical_loss": 4.450353975574341, + "tokens_seen": 168689664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 299406, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5940263271331787, + "objective/train/theoretical_loss": 4.450131961015606, + "objective/train/tokens_used": 189215200, + "theoretical_loss": 4.450131961015606, + "tokens_seen": 168755200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047922768304914745, + "loss": 3.5993, + "theoretical_loss": 4.450131961015606, + "tokens_seen": 168755200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047921765295887663, + "loss": 3.4595, + "theoretical_loss": 4.449910056790086, + "tokens_seen": 168820736 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004792076228686058, + "loss": 3.5586, + "theoretical_loss": 4.44968826280015, + "tokens_seen": 168886272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047919759277833504, + "loss": 3.4913, + "theoretical_loss": 4.4494665789482895, + "tokens_seen": 168951808 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047918756268806417, + "loss": 3.4411, + "theoretical_loss": 4.449245005137125, + "tokens_seen": 169017344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791775325977934, + "loss": 3.4321, + "theoretical_loss": 4.449023541269395, + "tokens_seen": 169082880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047916750250752253, + "loss": 3.6136, + "theoretical_loss": 4.448802187247966, + "tokens_seen": 169148416 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047915747241725177, + "loss": 3.5009, + "theoretical_loss": 4.448580942975825, + "tokens_seen": 169213952 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047914744232698095, + "loss": 3.3487, + "theoretical_loss": 4.448359808356084, + "tokens_seen": 169279488 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047913741223671013, + "loss": 3.4203, + "theoretical_loss": 4.448138783291979, + "tokens_seen": 169345024 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791273821464393, + "loss": 3.5822, + "theoretical_loss": 4.447917867686863, + "tokens_seen": 169410560 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047911735205616855, + "loss": 3.3471, + "theoretical_loss": 4.44769706144422, + "tokens_seen": 169476096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004791073219658977, + "loss": 3.5079, + "theoretical_loss": 4.44747636446765, + "tokens_seen": 169541632 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790972918756269, + "loss": 3.3723, + "theoretical_loss": 4.447255776660878, + "tokens_seen": 169607168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047908726178535604, + "loss": 3.5002, + "theoretical_loss": 4.44703529792775, + "tokens_seen": 169672704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047907723169508527, + "loss": 3.3986, + "theoretical_loss": 4.446814928172234, + "tokens_seen": 169738240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047906720160481445, + "loss": 3.4937, + "theoretical_loss": 4.446594667298421, + "tokens_seen": 169803776 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047905717151454363, + "loss": 3.4138, + "theoretical_loss": 4.446374515210521, + "tokens_seen": 169869312 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790471414242728, + "loss": 3.4827, + "theoretical_loss": 4.446154471812866, + "tokens_seen": 169934848 + }, + { + "epoch": 0.05, + "learning_rate": 0.000479037111334002, + "loss": 3.5763, + "theoretical_loss": 4.445934537009911, + "tokens_seen": 170000384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790270812437312, + "loss": 3.3503, + "theoretical_loss": 4.445714710706228, + "tokens_seen": 170065920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004790170511534604, + "loss": 3.4628, + "theoretical_loss": 4.445494992806513, + "tokens_seen": 170131456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047900702106318954, + "loss": 3.4792, + "theoretical_loss": 4.44527538321558, + "tokens_seen": 170196992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789969909729188, + "loss": 3.2559, + "theoretical_loss": 4.445055881838365, + "tokens_seen": 170262528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789869608826479, + "loss": 3.4326, + "theoretical_loss": 4.444836488579924, + "tokens_seen": 170328064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 302236, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1966967582702637, + "objective/train/theoretical_loss": 4.44461720334543, + "objective/train/tokens_used": 190853600, + "theoretical_loss": 4.44461720334543, + "tokens_seen": 170393600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047897693079237714, + "loss": 3.3741, + "theoretical_loss": 4.44461720334543, + "tokens_seen": 170393600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047896690070210637, + "loss": 3.5173, + "theoretical_loss": 4.444398026040179, + "tokens_seen": 170459136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789568706118355, + "loss": 3.5451, + "theoretical_loss": 4.444178956569585, + "tokens_seen": 170524672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047894684052156473, + "loss": 3.5798, + "theoretical_loss": 4.443959994839181, + "tokens_seen": 170590208 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789368104312939, + "loss": 3.5062, + "theoretical_loss": 4.44374114075462, + "tokens_seen": 170655744 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789267803410231, + "loss": 3.0981, + "theoretical_loss": 4.443522394221671, + "tokens_seen": 170721280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004789167502507523, + "loss": 3.4202, + "theoretical_loss": 4.443303755146225, + "tokens_seen": 170786816 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047890672016048146, + "loss": 3.3335, + "theoretical_loss": 4.443085223434291, + "tokens_seen": 170852352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047889669007021064, + "loss": 3.5163, + "theoretical_loss": 4.442866798991993, + "tokens_seen": 170917888 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788866599799399, + "loss": 3.6989, + "theoretical_loss": 4.442648481725577, + "tokens_seen": 170983424 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478876629889669, + "loss": 3.6262, + "theoretical_loss": 4.442430271541404, + "tokens_seen": 171048960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047886659979939824, + "loss": 3.6457, + "theoretical_loss": 4.442212168345956, + "tokens_seen": 171114496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047885656970912736, + "loss": 3.3704, + "theoretical_loss": 4.4419941720458285, + "tokens_seen": 171180032 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788465396188566, + "loss": 3.4452, + "theoretical_loss": 4.441776282547736, + "tokens_seen": 171245568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788365095285858, + "loss": 3.5024, + "theoretical_loss": 4.441558499758511, + "tokens_seen": 171311104 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047882647943831496, + "loss": 3.3782, + "theoretical_loss": 4.441340823585101, + "tokens_seen": 171376640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047881644934804414, + "loss": 3.4881, + "theoretical_loss": 4.441123253934572, + "tokens_seen": 171442176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788064192577733, + "loss": 3.3761, + "theoretical_loss": 4.440905790714105, + "tokens_seen": 171507712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787963891675025, + "loss": 3.5054, + "theoretical_loss": 4.440688433830999, + "tokens_seen": 171573248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047878635907723174, + "loss": 3.5374, + "theoretical_loss": 4.440471183192667, + "tokens_seen": 171638784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047877632898696087, + "loss": 3.3218, + "theoretical_loss": 4.440254038706639, + "tokens_seen": 171704320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787662988966901, + "loss": 3.4068, + "theoretical_loss": 4.440037000280561, + "tokens_seen": 171769856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004787562688064193, + "loss": 3.4859, + "theoretical_loss": 4.439820067822195, + "tokens_seen": 171835392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047874623871614847, + "loss": 3.3505, + "theoretical_loss": 4.439603241239416, + "tokens_seen": 171900928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047873620862587765, + "loss": 3.4467, + "theoretical_loss": 4.439386520440218, + "tokens_seen": 171966464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 304754, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4666953086853027, + "objective/train/theoretical_loss": 4.439169905332706, + "objective/train/tokens_used": 192492000, + "theoretical_loss": 4.439169905332706, + "tokens_seen": 172032000 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047872617853560683, + "loss": 3.5022, + "theoretical_loss": 4.439169905332706, + "tokens_seen": 172032000 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478716148445336, + "loss": 3.603, + "theoretical_loss": 4.438953395825102, + "tokens_seen": 172097536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047870611835506524, + "loss": 3.5046, + "theoretical_loss": 4.438736991825744, + "tokens_seen": 172163072 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047869608826479437, + "loss": 3.5215, + "theoretical_loss": 4.438520693243079, + "tokens_seen": 172228608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786860581745236, + "loss": 3.225, + "theoretical_loss": 4.4383044999856756, + "tokens_seen": 172294144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047867602808425273, + "loss": 3.5048, + "theoretical_loss": 4.438088411962211, + "tokens_seen": 172359680 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047866599799398197, + "loss": 3.3078, + "theoretical_loss": 4.437872429081477, + "tokens_seen": 172425216 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047865596790371115, + "loss": 3.4777, + "theoretical_loss": 4.437656551252381, + "tokens_seen": 172490752 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047864593781344033, + "loss": 3.6438, + "theoretical_loss": 4.4374407783839445, + "tokens_seen": 172556288 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786359077231695, + "loss": 3.4179, + "theoretical_loss": 4.437225110385297, + "tokens_seen": 172621824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047862587763289875, + "loss": 3.3754, + "theoretical_loss": 4.4370095471656885, + "tokens_seen": 172687360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786158475426279, + "loss": 3.4806, + "theoretical_loss": 4.436794088634477, + "tokens_seen": 172752896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004786058174523571, + "loss": 3.5536, + "theoretical_loss": 4.4365787347011345, + "tokens_seen": 172818432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047859578736208624, + "loss": 3.5273, + "theoretical_loss": 4.436363485275246, + "tokens_seen": 172883968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047858575727181547, + "loss": 3.3341, + "theoretical_loss": 4.436148340266508, + "tokens_seen": 172949504 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047857572718154465, + "loss": 3.4826, + "theoretical_loss": 4.435933299584729, + "tokens_seen": 173015040 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047856569709127383, + "loss": 3.6013, + "theoretical_loss": 4.4357183631398325, + "tokens_seen": 173080576 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478555667001003, + "loss": 3.3086, + "theoretical_loss": 4.435503530841849, + "tokens_seen": 173146112 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785456369107322, + "loss": 3.5736, + "theoretical_loss": 4.435288802600926, + "tokens_seen": 173211648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785356068204614, + "loss": 3.46, + "theoretical_loss": 4.4350741783273175, + "tokens_seen": 173277184 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785255767301906, + "loss": 3.5205, + "theoretical_loss": 4.434859657931392, + "tokens_seen": 173342720 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047851554663991974, + "loss": 3.3527, + "theoretical_loss": 4.434645241323629, + "tokens_seen": 173408256 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478505516549649, + "loss": 3.5177, + "theoretical_loss": 4.434430928414617, + "tokens_seen": 173473792 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784954864593781, + "loss": 3.6308, + "theoretical_loss": 4.434216719115057, + "tokens_seen": 173539328 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047848545636910734, + "loss": 3.5984, + "theoretical_loss": 4.43400261333576, + "tokens_seen": 173604864 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 307553, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7643442153930664, + "objective/train/theoretical_loss": 4.433788610987646, + "objective/train/tokens_used": 194130400, + "theoretical_loss": 4.433788610987646, + "tokens_seen": 173670400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784754262788365, + "loss": 3.6447, + "theoretical_loss": 4.433788610987646, + "tokens_seen": 173670400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784653961885657, + "loss": 3.4406, + "theoretical_loss": 4.433574711981749, + "tokens_seen": 173735936 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784553660982949, + "loss": 3.5417, + "theoretical_loss": 4.433360916229209, + "tokens_seen": 173801472 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784453360080241, + "loss": 3.5186, + "theoretical_loss": 4.433147223641278, + "tokens_seen": 173867008 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047843530591775324, + "loss": 3.4374, + "theoretical_loss": 4.432933634129318, + "tokens_seen": 173932544 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784252758274825, + "loss": 3.4189, + "theoretical_loss": 4.4327201476047975, + "tokens_seen": 173998080 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784152457372116, + "loss": 3.3408, + "theoretical_loss": 4.432506763979299, + "tokens_seen": 174063616 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047840521564694084, + "loss": 3.3812, + "theoretical_loss": 4.432293483164512, + "tokens_seen": 174129152 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047839518555667, + "loss": 3.3378, + "theoretical_loss": 4.432080305072233, + "tokens_seen": 174194688 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783851554663992, + "loss": 3.4663, + "theoretical_loss": 4.43186722961437, + "tokens_seen": 174260224 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783751253761284, + "loss": 3.5779, + "theoretical_loss": 4.431654256702938, + "tokens_seen": 174325760 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047836509528585756, + "loss": 3.3681, + "theoretical_loss": 4.431441386250063, + "tokens_seen": 174391296 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047835506519558675, + "loss": 3.4954, + "theoretical_loss": 4.4312286181679745, + "tokens_seen": 174456832 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478345035105316, + "loss": 3.3772, + "theoretical_loss": 4.431015952369016, + "tokens_seen": 174522368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783350050150451, + "loss": 3.518, + "theoretical_loss": 4.430803388765636, + "tokens_seen": 174587904 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047832497492477434, + "loss": 3.4375, + "theoretical_loss": 4.430590927270388, + "tokens_seen": 174653440 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047831494483450347, + "loss": 3.4694, + "theoretical_loss": 4.430378567795938, + "tokens_seen": 174718976 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004783049147442327, + "loss": 3.4775, + "theoretical_loss": 4.430166310255057, + "tokens_seen": 174784512 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782948846539619, + "loss": 3.3362, + "theoretical_loss": 4.429954154560624, + "tokens_seen": 174850048 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047828485456369107, + "loss": 3.4176, + "theoretical_loss": 4.429742100625624, + "tokens_seen": 174915584 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047827482447342025, + "loss": 3.456, + "theoretical_loss": 4.429530148363151, + "tokens_seen": 174981120 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782647943831495, + "loss": 3.2957, + "theoretical_loss": 4.429318297686402, + "tokens_seen": 175046656 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782547642928786, + "loss": 3.4751, + "theoretical_loss": 4.429106548508685, + "tokens_seen": 175112192 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047824473420260785, + "loss": 3.4639, + "theoretical_loss": 4.428894900743411, + "tokens_seen": 175177728 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478234704112337, + "loss": 3.4396, + "theoretical_loss": 4.428683354304098, + "tokens_seen": 175243264 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 308941, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.694732904434204, + "objective/train/theoretical_loss": 4.428471909104372, + "objective/train/tokens_used": 195768800, + "theoretical_loss": 4.428471909104372, + "tokens_seen": 175308800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782246740220662, + "loss": 3.4705, + "theoretical_loss": 4.428471909104372, + "tokens_seen": 175308800 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047821464393179544, + "loss": 3.4596, + "theoretical_loss": 4.428260565057964, + "tokens_seen": 175374336 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047820461384152457, + "loss": 3.5517, + "theoretical_loss": 4.428049322078708, + "tokens_seen": 175439872 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781945837512538, + "loss": 3.4071, + "theoretical_loss": 4.427838180080547, + "tokens_seen": 175505408 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047818455366098293, + "loss": 3.3692, + "theoretical_loss": 4.4276271389775275, + "tokens_seen": 175570944 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047817452357071217, + "loss": 3.5441, + "theoretical_loss": 4.427416198683803, + "tokens_seen": 175636480 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047816449348044135, + "loss": 3.4034, + "theoretical_loss": 4.427205359113629, + "tokens_seen": 175702016 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047815446339017053, + "loss": 3.3908, + "theoretical_loss": 4.42699462018137, + "tokens_seen": 175767552 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781444332998997, + "loss": 3.623, + "theoretical_loss": 4.42678398180149, + "tokens_seen": 175833088 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047813440320962895, + "loss": 3.5326, + "theoretical_loss": 4.426573443888563, + "tokens_seen": 175898624 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781243731193581, + "loss": 3.4978, + "theoretical_loss": 4.426363006357263, + "tokens_seen": 175964160 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781143430290873, + "loss": 3.6284, + "theoretical_loss": 4.426152669122374, + "tokens_seen": 176029696 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047810431293881644, + "loss": 3.587, + "theoretical_loss": 4.425942432098774, + "tokens_seen": 176095232 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047809428284854567, + "loss": 3.5163, + "theoretical_loss": 4.425732295201455, + "tokens_seen": 176160768 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047808425275827485, + "loss": 3.459, + "theoretical_loss": 4.425522258345508, + "tokens_seen": 176226304 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047807422266800403, + "loss": 3.5971, + "theoretical_loss": 4.425312321446127, + "tokens_seen": 176291840 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780641925777332, + "loss": 3.5135, + "theoretical_loss": 4.425102484418613, + "tokens_seen": 176357376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780541624874624, + "loss": 3.61, + "theoretical_loss": 4.424892747178365, + "tokens_seen": 176422912 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780441323971916, + "loss": 3.4853, + "theoretical_loss": 4.42468310964089, + "tokens_seen": 176488448 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780341023069208, + "loss": 3.3266, + "theoretical_loss": 4.424473571721794, + "tokens_seen": 176553984 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047802407221664994, + "loss": 3.4125, + "theoretical_loss": 4.42426413333679, + "tokens_seen": 176619520 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780140421263792, + "loss": 3.4351, + "theoretical_loss": 4.424054794401689, + "tokens_seen": 176685056 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004780040120361083, + "loss": 3.5899, + "theoretical_loss": 4.423845554832406, + "tokens_seen": 176750592 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047799398194583754, + "loss": 3.5189, + "theoretical_loss": 4.42363641454496, + "tokens_seen": 176816128 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779839518555667, + "loss": 3.4428, + "theoretical_loss": 4.423427373455471, + "tokens_seen": 176881664 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 311765, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.407197952270508, + "objective/train/theoretical_loss": 4.42321843148016, + "objective/train/tokens_used": 197407200, + "theoretical_loss": 4.42321843148016, + "tokens_seen": 176947200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779739217652959, + "loss": 3.5741, + "theoretical_loss": 4.42321843148016, + "tokens_seen": 176947200 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779638916750251, + "loss": 3.3499, + "theoretical_loss": 4.423009588535351, + "tokens_seen": 177012736 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779538615847543, + "loss": 3.6361, + "theoretical_loss": 4.422800844537466, + "tokens_seen": 177078272 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047794383149448344, + "loss": 3.3557, + "theoretical_loss": 4.422592199403036, + "tokens_seen": 177143808 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779338014042127, + "loss": 3.5333, + "theoretical_loss": 4.422383653048685, + "tokens_seen": 177209344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779237713139418, + "loss": 3.5367, + "theoretical_loss": 4.422175205391145, + "tokens_seen": 177274880 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047791374122367104, + "loss": 3.4942, + "theoretical_loss": 4.421966856347243, + "tokens_seen": 177340416 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004779037111334002, + "loss": 3.5612, + "theoretical_loss": 4.421758605833912, + "tokens_seen": 177405952 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778936810431294, + "loss": 3.5643, + "theoretical_loss": 4.421550453768181, + "tokens_seen": 177471488 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778836509528586, + "loss": 3.5665, + "theoretical_loss": 4.421342400067183, + "tokens_seen": 177537024 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047787362086258776, + "loss": 3.6341, + "theoretical_loss": 4.42113444464815, + "tokens_seen": 177602560 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047786359077231695, + "loss": 3.3694, + "theoretical_loss": 4.420926587428411, + "tokens_seen": 177668096 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778535606820462, + "loss": 3.3628, + "theoretical_loss": 4.420718828325403, + "tokens_seen": 177733632 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778435305917753, + "loss": 3.5442, + "theoretical_loss": 4.420511167256656, + "tokens_seen": 177799168 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047783350050150454, + "loss": 3.3775, + "theoretical_loss": 4.4203036041398, + "tokens_seen": 177864704 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047782347041123367, + "loss": 3.5546, + "theoretical_loss": 4.420096138892568, + "tokens_seen": 177930240 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778134403209629, + "loss": 3.3493, + "theoretical_loss": 4.419888771432789, + "tokens_seen": 177995776 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004778034102306921, + "loss": 3.3222, + "theoretical_loss": 4.419681501678395, + "tokens_seen": 178061312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047779338014042127, + "loss": 3.4137, + "theoretical_loss": 4.419474329547413, + "tokens_seen": 178126848 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047778335005015045, + "loss": 3.5327, + "theoretical_loss": 4.419267254957971, + "tokens_seen": 178192384 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777733199598797, + "loss": 3.4228, + "theoretical_loss": 4.419060277828295, + "tokens_seen": 178257920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777632898696088, + "loss": 3.4008, + "theoretical_loss": 4.41885339807671, + "tokens_seen": 178323456 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047775325977933805, + "loss": 3.6441, + "theoretical_loss": 4.4186466156216415, + "tokens_seen": 178388992 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777432296890672, + "loss": 3.6867, + "theoretical_loss": 4.418439930381609, + "tokens_seen": 178454528 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777331995987964, + "loss": 3.5349, + "theoretical_loss": 4.418233342275233, + "tokens_seen": 178520064 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 314648, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.918797254562378, + "objective/train/theoretical_loss": 4.418026851221231, + "objective/train/tokens_used": 199045600, + "theoretical_loss": 4.418026851221231, + "tokens_seen": 178585600 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004777231695085256, + "loss": 3.5129, + "theoretical_loss": 4.418026851221231, + "tokens_seen": 178585600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047771313941825477, + "loss": 3.5361, + "theoretical_loss": 4.4178204571384185, + "tokens_seen": 178651136 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047770310932798395, + "loss": 3.4031, + "theoretical_loss": 4.41761415994571, + "tokens_seen": 178716672 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047769307923771313, + "loss": 3.5094, + "theoretical_loss": 4.417407959562116, + "tokens_seen": 178782208 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776830491474423, + "loss": 3.4157, + "theoretical_loss": 4.417201855906742, + "tokens_seen": 178847744 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047767301905717155, + "loss": 3.6327, + "theoretical_loss": 4.416995848898797, + "tokens_seen": 178913280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776629889669007, + "loss": 3.5767, + "theoretical_loss": 4.4167899384575815, + "tokens_seen": 178978816 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776529588766299, + "loss": 3.5093, + "theoretical_loss": 4.416584124502495, + "tokens_seen": 179044352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047764292878635904, + "loss": 3.4126, + "theoretical_loss": 4.416378406953033, + "tokens_seen": 179109888 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776328986960883, + "loss": 3.4806, + "theoretical_loss": 4.41617278572879, + "tokens_seen": 179175424 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047762286860581746, + "loss": 3.5097, + "theoretical_loss": 4.4159672607494524, + "tokens_seen": 179240960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047761283851554664, + "loss": 3.389, + "theoretical_loss": 4.415761831934808, + "tokens_seen": 179306496 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004776028084252758, + "loss": 3.4112, + "theoretical_loss": 4.415556499204737, + "tokens_seen": 179372032 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047759277833500505, + "loss": 3.3044, + "theoretical_loss": 4.415351262479216, + "tokens_seen": 179437568 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775827482447342, + "loss": 3.5062, + "theoretical_loss": 4.415146121678321, + "tokens_seen": 179503104 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775727181544634, + "loss": 3.3749, + "theoretical_loss": 4.414941076722219, + "tokens_seen": 179568640 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047756268806419254, + "loss": 3.4216, + "theoretical_loss": 4.4147361275311745, + "tokens_seen": 179634176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775526579739218, + "loss": 3.4588, + "theoretical_loss": 4.414531274025548, + "tokens_seen": 179699712 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047754262788365096, + "loss": 3.5234, + "theoretical_loss": 4.414326516125795, + "tokens_seen": 179765248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047753259779338014, + "loss": 3.383, + "theoretical_loss": 4.414121853752466, + "tokens_seen": 179830784 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775225677031093, + "loss": 3.4243, + "theoretical_loss": 4.413917286826205, + "tokens_seen": 179896320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775125376128385, + "loss": 3.4291, + "theoretical_loss": 4.413712815267752, + "tokens_seen": 179961856 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004775025075225677, + "loss": 3.4751, + "theoretical_loss": 4.413508438997944, + "tokens_seen": 180027392 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774924774322969, + "loss": 3.4478, + "theoretical_loss": 4.4133041579377075, + "tokens_seen": 180092928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047748244734202605, + "loss": 3.5437, + "theoretical_loss": 4.413099972008068, + "tokens_seen": 180158464 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 317009, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.49520206451416, + "objective/train/theoretical_loss": 4.412895881130142, + "objective/train/tokens_used": 200684000, + "theoretical_loss": 4.412895881130142, + "tokens_seen": 180224000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774724172517553, + "loss": 3.5079, + "theoretical_loss": 4.412895881130142, + "tokens_seen": 180224000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774623871614845, + "loss": 3.5592, + "theoretical_loss": 4.412691885225141, + "tokens_seen": 180289536 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047745235707121364, + "loss": 3.4354, + "theoretical_loss": 4.412487984214373, + "tokens_seen": 180355072 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774423269809429, + "loss": 3.6014, + "theoretical_loss": 4.412284178019235, + "tokens_seen": 180420608 + }, + { + "epoch": 0.05, + "learning_rate": 0.000477432296890672, + "loss": 3.5539, + "theoretical_loss": 4.412080466561221, + "tokens_seen": 180486144 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047742226680040124, + "loss": 3.6205, + "theoretical_loss": 4.411876849761917, + "tokens_seen": 180551680 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774122367101304, + "loss": 3.485, + "theoretical_loss": 4.411673327543005, + "tokens_seen": 180617216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004774022066198596, + "loss": 3.5956, + "theoretical_loss": 4.4114698998262565, + "tokens_seen": 180682752 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773921765295888, + "loss": 3.3422, + "theoretical_loss": 4.411266566533539, + "tokens_seen": 180748288 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047738214643931797, + "loss": 3.4084, + "theoretical_loss": 4.41106332758681, + "tokens_seen": 180813824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047737211634904715, + "loss": 3.4681, + "theoretical_loss": 4.41086018290812, + "tokens_seen": 180879360 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773620862587764, + "loss": 3.4065, + "theoretical_loss": 4.410657132419617, + "tokens_seen": 180944896 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773520561685055, + "loss": 3.3914, + "theoretical_loss": 4.410454176043537, + "tokens_seen": 181010432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047734202607823474, + "loss": 3.4487, + "theoretical_loss": 4.410251313702208, + "tokens_seen": 181075968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047733199598796387, + "loss": 3.2251, + "theoretical_loss": 4.410048545318052, + "tokens_seen": 181141504 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773219658976931, + "loss": 3.4884, + "theoretical_loss": 4.409845870813582, + "tokens_seen": 181207040 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004773119358074223, + "loss": 3.4424, + "theoretical_loss": 4.409643290111404, + "tokens_seen": 181272576 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047730190571715147, + "loss": 3.5641, + "theoretical_loss": 4.409440803134215, + "tokens_seen": 181338112 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047729187562688065, + "loss": 3.4214, + "theoretical_loss": 4.409238409804804, + "tokens_seen": 181403648 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004772818455366099, + "loss": 3.3464, + "theoretical_loss": 4.409036110046051, + "tokens_seen": 181469184 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477271815446339, + "loss": 3.4038, + "theoretical_loss": 4.408833903780926, + "tokens_seen": 181534720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047726178535606825, + "loss": 3.3784, + "theoretical_loss": 4.408631790932494, + "tokens_seen": 181600256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772517552657974, + "loss": 3.4303, + "theoretical_loss": 4.408429771423909, + "tokens_seen": 181665792 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772417251755266, + "loss": 3.4789, + "theoretical_loss": 4.408227845178414, + "tokens_seen": 181731328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004772316950852558, + "loss": 3.3851, + "theoretical_loss": 4.408026012119344, + "tokens_seen": 181796864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 319880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.470914602279663, + "objective/train/theoretical_loss": 4.407824272170128, + "objective/train/tokens_used": 202322400, + "theoretical_loss": 4.407824272170128, + "tokens_seen": 181862400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047722166499498497, + "loss": 3.3753, + "theoretical_loss": 4.407824272170128, + "tokens_seen": 181862400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047721163490471415, + "loss": 3.5202, + "theoretical_loss": 4.407622625254279, + "tokens_seen": 181927936 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047720160481444333, + "loss": 3.3166, + "theoretical_loss": 4.407421071295406, + "tokens_seen": 181993472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771915747241725, + "loss": 3.2113, + "theoretical_loss": 4.407219610217206, + "tokens_seen": 182059008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047718154463390175, + "loss": 3.4513, + "theoretical_loss": 4.407018241943467, + "tokens_seen": 182124544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771715145436309, + "loss": 3.307, + "theoretical_loss": 4.406816966398064, + "tokens_seen": 182190080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771614844533601, + "loss": 3.4345, + "theoretical_loss": 4.406615783504965, + "tokens_seen": 182255616 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047715145436308924, + "loss": 3.533, + "theoretical_loss": 4.4064146931882275, + "tokens_seen": 182321152 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771414242728185, + "loss": 3.4924, + "theoretical_loss": 4.406213695371996, + "tokens_seen": 182386688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047713139418254766, + "loss": 3.4217, + "theoretical_loss": 4.406012789980506, + "tokens_seen": 182452224 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047712136409227684, + "loss": 3.5099, + "theoretical_loss": 4.405811976938084, + "tokens_seen": 182517760 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477111334002006, + "loss": 3.4381, + "theoretical_loss": 4.405611256169143, + "tokens_seen": 182583296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047710130391173525, + "loss": 3.4575, + "theoretical_loss": 4.405410627598185, + "tokens_seen": 182648832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770912738214644, + "loss": 3.468, + "theoretical_loss": 4.405210091149802, + "tokens_seen": 182714368 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770812437311936, + "loss": 3.5876, + "theoretical_loss": 4.405009646748674, + "tokens_seen": 182779904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047707121364092274, + "loss": 3.3959, + "theoretical_loss": 4.404809294319572, + "tokens_seen": 182845440 + }, + { + "epoch": 0.06, + "learning_rate": 0.000477061183550652, + "loss": 3.5392, + "theoretical_loss": 4.40460903378735, + "tokens_seen": 182910976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047705115346038116, + "loss": 3.437, + "theoretical_loss": 4.404408865076955, + "tokens_seen": 182976512 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047704112337011034, + "loss": 3.4214, + "theoretical_loss": 4.404208788113422, + "tokens_seen": 183042048 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770310932798395, + "loss": 3.4463, + "theoretical_loss": 4.404008802821871, + "tokens_seen": 183107584 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770210631895687, + "loss": 3.5783, + "theoretical_loss": 4.4038089091275125, + "tokens_seen": 183173120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770110330992979, + "loss": 3.3004, + "theoretical_loss": 4.403609106955645, + "tokens_seen": 183238656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770010030090271, + "loss": 3.2046, + "theoretical_loss": 4.403409396231651, + "tokens_seen": 183304192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047699097291875625, + "loss": 3.3939, + "theoretical_loss": 4.403209776881004, + "tokens_seen": 183369728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769809428284855, + "loss": 3.4464, + "theoretical_loss": 4.403010248829265, + "tokens_seen": 183435264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 322732, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.123575210571289, + "objective/train/theoretical_loss": 4.4028108120020795, + "objective/train/tokens_used": 203960800, + "theoretical_loss": 4.4028108120020795, + "tokens_seen": 183500800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047697091273821466, + "loss": 3.4778, + "theoretical_loss": 4.4028108120020795, + "tokens_seen": 183500800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047696088264794384, + "loss": 3.6034, + "theoretical_loss": 4.402611466325182, + "tokens_seen": 183566336 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476950852557673, + "loss": 3.5015, + "theoretical_loss": 4.4024122117243945, + "tokens_seen": 183631872 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769408224674022, + "loss": 3.5805, + "theoretical_loss": 4.402213048125624, + "tokens_seen": 183697408 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769307923771314, + "loss": 3.5495, + "theoretical_loss": 4.4020139754548655, + "tokens_seen": 183762944 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769207622868606, + "loss": 3.4697, + "theoretical_loss": 4.401814993638199, + "tokens_seen": 183828480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047691073219658975, + "loss": 3.4042, + "theoretical_loss": 4.4016161026017935, + "tokens_seen": 183894016 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476900702106319, + "loss": 3.5228, + "theoretical_loss": 4.401417302271902, + "tokens_seen": 183959552 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768906720160481, + "loss": 3.4242, + "theoretical_loss": 4.401218592574865, + "tokens_seen": 184025088 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047688064192577735, + "loss": 3.329, + "theoretical_loss": 4.401019973437108, + "tokens_seen": 184090624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047687061183550653, + "loss": 3.4048, + "theoretical_loss": 4.400821444785143, + "tokens_seen": 184156160 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768605817452357, + "loss": 3.5057, + "theoretical_loss": 4.400623006545567, + "tokens_seen": 184221696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768505516549649, + "loss": 3.5541, + "theoretical_loss": 4.400424658645065, + "tokens_seen": 184287232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047684052156469407, + "loss": 3.3083, + "theoretical_loss": 4.400226401010404, + "tokens_seen": 184352768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047683049147442325, + "loss": 3.4043, + "theoretical_loss": 4.40002823356844, + "tokens_seen": 184418304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768204613841525, + "loss": 3.5978, + "theoretical_loss": 4.39983015624611, + "tokens_seen": 184483840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004768104312938816, + "loss": 3.6923, + "theoretical_loss": 4.39963216897044, + "tokens_seen": 184549376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047680040120361085, + "loss": 3.3851, + "theoretical_loss": 4.3994342716685395, + "tokens_seen": 184614912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047679037111334003, + "loss": 3.4629, + "theoretical_loss": 4.399236464267602, + "tokens_seen": 184680448 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767803410230692, + "loss": 3.2617, + "theoretical_loss": 4.399038746694908, + "tokens_seen": 184745984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767703109327984, + "loss": 3.3335, + "theoretical_loss": 4.398841118877819, + "tokens_seen": 184811520 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767602808425276, + "loss": 3.156, + "theoretical_loss": 4.398643580743785, + "tokens_seen": 184877056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047675025075225676, + "loss": 3.4924, + "theoretical_loss": 4.398446132220338, + "tokens_seen": 184942592 + }, + { + "epoch": 0.06, + "learning_rate": 0.000476740220661986, + "loss": 3.4322, + "theoretical_loss": 4.3982487732350934, + "tokens_seen": 185008128 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767301905717151, + "loss": 3.419, + "theoretical_loss": 4.398051503715753, + "tokens_seen": 185073664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 325427, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.745425224304199, + "objective/train/theoretical_loss": 4.397854323590102, + "objective/train/tokens_used": 205599200, + "theoretical_loss": 4.397854323590102, + "tokens_seen": 185139200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047672016048144435, + "loss": 3.6083, + "theoretical_loss": 4.397854323590102, + "tokens_seen": 185139200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047671013039117353, + "loss": 3.521, + "theoretical_loss": 4.397657232786008, + "tokens_seen": 185204736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767001003009027, + "loss": 3.4482, + "theoretical_loss": 4.397460231231424, + "tokens_seen": 185270272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047669007021063195, + "loss": 3.4954, + "theoretical_loss": 4.397263318854384, + "tokens_seen": 185335808 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766800401203611, + "loss": 3.5176, + "theoretical_loss": 4.39706649558301, + "tokens_seen": 185401344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766700100300903, + "loss": 3.5024, + "theoretical_loss": 4.396869761345503, + "tokens_seen": 185466880 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047665997993981944, + "loss": 3.5173, + "theoretical_loss": 4.396673116070147, + "tokens_seen": 185532416 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766499498495487, + "loss": 3.4455, + "theoretical_loss": 4.396476559685315, + "tokens_seen": 185597952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047663991975927786, + "loss": 3.3593, + "theoretical_loss": 4.396280092119455, + "tokens_seen": 185663488 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047662988966900704, + "loss": 3.4833, + "theoretical_loss": 4.3960837133011035, + "tokens_seen": 185729024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004766198595787362, + "loss": 3.5328, + "theoretical_loss": 4.395887423158877, + "tokens_seen": 185794560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047660982948846545, + "loss": 3.4713, + "theoretical_loss": 4.395691221621476, + "tokens_seen": 185860096 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765997993981946, + "loss": 3.4366, + "theoretical_loss": 4.395495108617682, + "tokens_seen": 185925632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765897693079238, + "loss": 3.418, + "theoretical_loss": 4.39529908407636, + "tokens_seen": 185991168 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047657973921765294, + "loss": 3.4284, + "theoretical_loss": 4.3951031479264575, + "tokens_seen": 186056704 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765697091273822, + "loss": 3.629, + "theoretical_loss": 4.394907300097002, + "tokens_seen": 186122240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047655967903711136, + "loss": 3.4037, + "theoretical_loss": 4.394711540517106, + "tokens_seen": 186187776 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047654964894684054, + "loss": 3.348, + "theoretical_loss": 4.39451586911596, + "tokens_seen": 186253312 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765396188565697, + "loss": 3.4593, + "theoretical_loss": 4.39432028582284, + "tokens_seen": 186318848 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765295887662989, + "loss": 3.4393, + "theoretical_loss": 4.394124790567101, + "tokens_seen": 186384384 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765195586760281, + "loss": 3.461, + "theoretical_loss": 4.3939293832781825, + "tokens_seen": 186449920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765095285857573, + "loss": 3.4943, + "theoretical_loss": 4.393734063885599, + "tokens_seen": 186515456 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047649949849548645, + "loss": 3.419, + "theoretical_loss": 4.3935388323189555, + "tokens_seen": 186580992 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764894684052157, + "loss": 3.5452, + "theoretical_loss": 4.39334368850793, + "tokens_seen": 186646528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047647943831494486, + "loss": 3.2071, + "theoretical_loss": 4.3931486323822835, + "tokens_seen": 186712064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 328230, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.658965826034546, + "objective/train/theoretical_loss": 4.392953663871862, + "objective/train/tokens_used": 207237600, + "theoretical_loss": 4.392953663871862, + "tokens_seen": 186777600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047646940822467404, + "loss": 3.6103, + "theoretical_loss": 4.392953663871862, + "tokens_seen": 186777600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764593781344032, + "loss": 3.3959, + "theoretical_loss": 4.392758782906586, + "tokens_seen": 186843136 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764493480441324, + "loss": 3.3157, + "theoretical_loss": 4.392563989416462, + "tokens_seen": 186908672 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764393179538616, + "loss": 3.5188, + "theoretical_loss": 4.392369283331574, + "tokens_seen": 186974208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764292878635908, + "loss": 3.356, + "theoretical_loss": 4.392174664582085, + "tokens_seen": 187039744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047641925777331995, + "loss": 3.338, + "theoretical_loss": 4.391980133098244, + "tokens_seen": 187105280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764092276830492, + "loss": 3.3257, + "theoretical_loss": 4.391785688810373, + "tokens_seen": 187170816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763991975927783, + "loss": 3.5453, + "theoretical_loss": 4.391591331648879, + "tokens_seen": 187236352 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047638916750250755, + "loss": 3.4527, + "theoretical_loss": 4.391397061544247, + "tokens_seen": 187301888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047637913741223673, + "loss": 3.258, + "theoretical_loss": 4.391202878427042, + "tokens_seen": 187367424 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763691073219659, + "loss": 3.5294, + "theoretical_loss": 4.3910087822279085, + "tokens_seen": 187432960 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763590772316951, + "loss": 3.2755, + "theoretical_loss": 4.390814772877571, + "tokens_seen": 187498496 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047634904714142427, + "loss": 3.4926, + "theoretical_loss": 4.390620850306832, + "tokens_seen": 187564032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047633901705115345, + "loss": 3.5549, + "theoretical_loss": 4.390427014446575, + "tokens_seen": 187629568 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763289869608827, + "loss": 3.327, + "theoretical_loss": 4.390233265227764, + "tokens_seen": 187695104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004763189568706118, + "loss": 3.4708, + "theoretical_loss": 4.390039602581437, + "tokens_seen": 187760640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047630892678034105, + "loss": 3.2824, + "theoretical_loss": 4.389846026438715, + "tokens_seen": 187826176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047629889669007023, + "loss": 3.5281, + "theoretical_loss": 4.3896525367307975, + "tokens_seen": 187891712 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762888665997994, + "loss": 3.34, + "theoretical_loss": 4.389459133388962, + "tokens_seen": 187957248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762788365095286, + "loss": 3.3874, + "theoretical_loss": 4.3892658163445635, + "tokens_seen": 188022784 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762688064192578, + "loss": 3.2877, + "theoretical_loss": 4.389072585529037, + "tokens_seen": 188088320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047625877632898696, + "loss": 3.2901, + "theoretical_loss": 4.388879440873897, + "tokens_seen": 188153856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762487462387162, + "loss": 3.5192, + "theoretical_loss": 4.388686382310732, + "tokens_seen": 188219392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762387161484453, + "loss": 3.3715, + "theoretical_loss": 4.388493409771213, + "tokens_seen": 188284928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047622868605817455, + "loss": 3.3783, + "theoretical_loss": 4.388300523187087, + "tokens_seen": 188350464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 330800, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4630286693573, + "objective/train/theoretical_loss": 4.3881077224901786, + "objective/train/tokens_used": 208876000, + "theoretical_loss": 4.3881077224901786, + "tokens_seen": 188416000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762186559679037, + "loss": 3.4037, + "theoretical_loss": 4.3881077224901786, + "tokens_seen": 188416000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004762086258776329, + "loss": 3.3913, + "theoretical_loss": 4.38791500761239, + "tokens_seen": 188481536 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761985957873621, + "loss": 3.5609, + "theoretical_loss": 4.387722378485703, + "tokens_seen": 188547072 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761885656970913, + "loss": 3.4083, + "theoretical_loss": 4.3875298350421765, + "tokens_seen": 188612608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047617853560682046, + "loss": 3.3149, + "theoretical_loss": 4.387337377213943, + "tokens_seen": 188678144 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047616850551654964, + "loss": 3.6562, + "theoretical_loss": 4.387145004933218, + "tokens_seen": 188743680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761584754262788, + "loss": 3.4408, + "theoretical_loss": 4.38695271813229, + "tokens_seen": 188809216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047614844533600806, + "loss": 3.3279, + "theoretical_loss": 4.386760516743526, + "tokens_seen": 188874752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761384152457372, + "loss": 3.5432, + "theoretical_loss": 4.38656840069937, + "tokens_seen": 188940288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761283851554664, + "loss": 3.3843, + "theoretical_loss": 4.386376369932344, + "tokens_seen": 189005824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761183550651956, + "loss": 3.2084, + "theoretical_loss": 4.386184424375044, + "tokens_seen": 189071360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004761083249749248, + "loss": 3.4023, + "theoretical_loss": 4.385992563960145, + "tokens_seen": 189136896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047609829488465396, + "loss": 3.2987, + "theoretical_loss": 4.385800788620397, + "tokens_seen": 189202432 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047608826479438314, + "loss": 3.4567, + "theoretical_loss": 4.385609098288628, + "tokens_seen": 189267968 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760782347041123, + "loss": 3.4383, + "theoretical_loss": 4.385417492897741, + "tokens_seen": 189333504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047606820461384156, + "loss": 3.4959, + "theoretical_loss": 4.385225972380715, + "tokens_seen": 189399040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760581745235707, + "loss": 3.531, + "theoretical_loss": 4.385034536670606, + "tokens_seen": 189464576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760481444332999, + "loss": 3.4409, + "theoretical_loss": 4.384843185700544, + "tokens_seen": 189530112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047603811434302905, + "loss": 3.4162, + "theoretical_loss": 4.384651919403739, + "tokens_seen": 189595648 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004760280842527583, + "loss": 3.4179, + "theoretical_loss": 4.384460737713471, + "tokens_seen": 189661184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047601805416248746, + "loss": 3.3534, + "theoretical_loss": 4.384269640563101, + "tokens_seen": 189726720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047600802407221665, + "loss": 3.5377, + "theoretical_loss": 4.384078627886062, + "tokens_seen": 189792256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759979939819458, + "loss": 3.3396, + "theoretical_loss": 4.383887699615863, + "tokens_seen": 189857792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047598796389167506, + "loss": 3.4824, + "theoretical_loss": 4.38369685568609, + "tokens_seen": 189923328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759779338014042, + "loss": 3.3973, + "theoretical_loss": 4.383506096030401, + "tokens_seen": 189988864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 333650, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.499528169631958, + "objective/train/theoretical_loss": 4.383315420582533, + "objective/train/tokens_used": 210514400, + "theoretical_loss": 4.383315420582533, + "tokens_seen": 190054400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759679037111334, + "loss": 3.5087, + "theoretical_loss": 4.383315420582533, + "tokens_seen": 190054400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759578736208626, + "loss": 3.5494, + "theoretical_loss": 4.383124829276294, + "tokens_seen": 190119936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759478435305918, + "loss": 3.5058, + "theoretical_loss": 4.38293432204557, + "tokens_seen": 190185472 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475937813440321, + "loss": 3.5134, + "theoretical_loss": 4.382743898824321, + "tokens_seen": 190251008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047592778335005015, + "loss": 3.3861, + "theoretical_loss": 4.3825535595465785, + "tokens_seen": 190316544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759177532597794, + "loss": 3.3335, + "theoretical_loss": 4.382363304146453, + "tokens_seen": 190382080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759077231695085, + "loss": 3.4145, + "theoretical_loss": 4.382173132558126, + "tokens_seen": 190447616 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047589769307923775, + "loss": 3.4687, + "theoretical_loss": 4.381983044715856, + "tokens_seen": 190513152 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047588766298896693, + "loss": 3.4259, + "theoretical_loss": 4.381793040553973, + "tokens_seen": 190578688 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758776328986961, + "loss": 3.4502, + "theoretical_loss": 4.381603120006883, + "tokens_seen": 190644224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758676028084253, + "loss": 3.3304, + "theoretical_loss": 4.381413283009065, + "tokens_seen": 190709760 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047585757271815447, + "loss": 3.5419, + "theoretical_loss": 4.381223529495073, + "tokens_seen": 190775296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047584754262788365, + "loss": 3.4067, + "theoretical_loss": 4.381033859399532, + "tokens_seen": 190840832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004758375125376129, + "loss": 3.4339, + "theoretical_loss": 4.380844272657145, + "tokens_seen": 190906368 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475827482447342, + "loss": 3.4071, + "theoretical_loss": 4.380654769202683, + "tokens_seen": 190971904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047581745235707125, + "loss": 3.3742, + "theoretical_loss": 4.380465348970995, + "tokens_seen": 191037440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047580742226680043, + "loss": 3.614, + "theoretical_loss": 4.380276011897003, + "tokens_seen": 191102976 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757973921765296, + "loss": 3.3705, + "theoretical_loss": 4.380086757915698, + "tokens_seen": 191168512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757873620862588, + "loss": 3.4844, + "theoretical_loss": 4.379897586962148, + "tokens_seen": 191234048 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475777331995988, + "loss": 3.4248, + "theoretical_loss": 4.379708498971494, + "tokens_seen": 191299584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047576730190571716, + "loss": 3.4203, + "theoretical_loss": 4.379519493878948, + "tokens_seen": 191365120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757572718154464, + "loss": 3.3315, + "theoretical_loss": 4.379330571619795, + "tokens_seen": 191430656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757472417251755, + "loss": 3.3467, + "theoretical_loss": 4.379141732129394, + "tokens_seen": 191496192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047573721163490475, + "loss": 3.2782, + "theoretical_loss": 4.378952975343175, + "tokens_seen": 191561728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757271815446339, + "loss": 3.3989, + "theoretical_loss": 4.378764301196642, + "tokens_seen": 191627264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 334982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6754090785980225, + "objective/train/theoretical_loss": 4.37857570962537, + "objective/train/tokens_used": 212152800, + "theoretical_loss": 4.37857570962537, + "tokens_seen": 191692800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757171514543631, + "loss": 3.5055, + "theoretical_loss": 4.37857570962537, + "tokens_seen": 191692800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004757071213640923, + "loss": 3.2735, + "theoretical_loss": 4.378387200565006, + "tokens_seen": 191758336 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756970912738215, + "loss": 3.4006, + "theoretical_loss": 4.378198773951272, + "tokens_seen": 191823872 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047568706118355066, + "loss": 3.2728, + "theoretical_loss": 4.378010429719957, + "tokens_seen": 191889408 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047567703109327984, + "loss": 3.3323, + "theoretical_loss": 4.377822167806928, + "tokens_seen": 191954944 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475667001003009, + "loss": 3.4345, + "theoretical_loss": 4.377633988148117, + "tokens_seen": 192020480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047565697091273826, + "loss": 3.1955, + "theoretical_loss": 4.377445890679534, + "tokens_seen": 192086016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756469408224674, + "loss": 3.2745, + "theoretical_loss": 4.377257875337257, + "tokens_seen": 192151552 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756369107321966, + "loss": 3.4328, + "theoretical_loss": 4.377069942057436, + "tokens_seen": 192217088 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756268806419258, + "loss": 3.4281, + "theoretical_loss": 4.376882090776293, + "tokens_seen": 192282624 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475616850551655, + "loss": 3.3622, + "theoretical_loss": 4.376694321430121, + "tokens_seen": 192348160 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047560682046138416, + "loss": 3.527, + "theoretical_loss": 4.376506633955286, + "tokens_seen": 192413696 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047559679037111334, + "loss": 3.276, + "theoretical_loss": 4.376319028288219, + "tokens_seen": 192479232 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755867602808425, + "loss": 3.333, + "theoretical_loss": 4.37613150436543, + "tokens_seen": 192544768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047557673019057176, + "loss": 3.4332, + "theoretical_loss": 4.375944062123496, + "tokens_seen": 192610304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755667001003009, + "loss": 3.3883, + "theoretical_loss": 4.375756701499063, + "tokens_seen": 192675840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755566700100301, + "loss": 3.4159, + "theoretical_loss": 4.3755694224288515, + "tokens_seen": 192741376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047554663991975925, + "loss": 3.4646, + "theoretical_loss": 4.375382224849648, + "tokens_seen": 192806912 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004755366098294885, + "loss": 3.4422, + "theoretical_loss": 4.375195108698316, + "tokens_seen": 192872448 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047552657973921766, + "loss": 3.4982, + "theoretical_loss": 4.375008073911781, + "tokens_seen": 192937984 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047551654964894685, + "loss": 3.3865, + "theoretical_loss": 4.374821120427047, + "tokens_seen": 193003520 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047550651955867603, + "loss": 3.3777, + "theoretical_loss": 4.374634248181182, + "tokens_seen": 193069056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047549648946840526, + "loss": 3.4966, + "theoretical_loss": 4.3744474571113265, + "tokens_seen": 193134592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754864593781344, + "loss": 3.4671, + "theoretical_loss": 4.374260747154692, + "tokens_seen": 193200128 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754764292878636, + "loss": 3.5804, + "theoretical_loss": 4.374074118248559, + "tokens_seen": 193265664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 337587, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.444361925125122, + "objective/train/theoretical_loss": 4.373887570330275, + "objective/train/tokens_used": 213791200, + "theoretical_loss": 4.373887570330275, + "tokens_seen": 193331200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047546639919759275, + "loss": 3.434, + "theoretical_loss": 4.373887570330275, + "tokens_seen": 193331200 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475456369107322, + "loss": 3.4635, + "theoretical_loss": 4.373701103337263, + "tokens_seen": 193396736 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047544633901705117, + "loss": 3.2998, + "theoretical_loss": 4.373514717207009, + "tokens_seen": 193462272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047543630892678035, + "loss": 3.473, + "theoretical_loss": 4.373328411877073, + "tokens_seen": 193527808 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047542627883650953, + "loss": 3.471, + "theoretical_loss": 4.373142187285083, + "tokens_seen": 193593344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754162487462387, + "loss": 3.4301, + "theoretical_loss": 4.372956043368736, + "tokens_seen": 193658880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754062186559679, + "loss": 3.4512, + "theoretical_loss": 4.372769980065797, + "tokens_seen": 193724416 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047539618856569713, + "loss": 3.5896, + "theoretical_loss": 4.372583997314104, + "tokens_seen": 193789952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047538615847542625, + "loss": 3.4597, + "theoretical_loss": 4.372398095051559, + "tokens_seen": 193855488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753761283851555, + "loss": 3.6282, + "theoretical_loss": 4.372212273216136, + "tokens_seen": 193921024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753660982948846, + "loss": 3.2805, + "theoretical_loss": 4.372026531745877, + "tokens_seen": 193986560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047535606820461385, + "loss": 3.223, + "theoretical_loss": 4.371840870578891, + "tokens_seen": 194052096 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047534603811434303, + "loss": 3.501, + "theoretical_loss": 4.37165528965336, + "tokens_seen": 194117632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753360080240722, + "loss": 3.2599, + "theoretical_loss": 4.371469788907529, + "tokens_seen": 194183168 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004753259779338014, + "loss": 3.3896, + "theoretical_loss": 4.371284368279714, + "tokens_seen": 194248704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047531594784353063, + "loss": 3.5411, + "theoretical_loss": 4.3710990277083, + "tokens_seen": 194314240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047530591775325976, + "loss": 3.5489, + "theoretical_loss": 4.3709137671317375, + "tokens_seen": 194379776 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475295887662989, + "loss": 3.3062, + "theoretical_loss": 4.37072858648855, + "tokens_seen": 194445312 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752858575727181, + "loss": 3.406, + "theoretical_loss": 4.370543485717322, + "tokens_seen": 194510848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047527582748244736, + "loss": 3.2224, + "theoretical_loss": 4.370358464756713, + "tokens_seen": 194576384 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047526579739217654, + "loss": 3.3225, + "theoretical_loss": 4.370173523545443, + "tokens_seen": 194641920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752557673019057, + "loss": 3.3479, + "theoretical_loss": 4.3699886620223065, + "tokens_seen": 194707456 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752457372116349, + "loss": 3.4427, + "theoretical_loss": 4.369803880126162, + "tokens_seen": 194772992 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752357071213641, + "loss": 3.4007, + "theoretical_loss": 4.3696191777959354, + "tokens_seen": 194838528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047522567703109326, + "loss": 3.3588, + "theoretical_loss": 4.369434554970621, + "tokens_seen": 194904064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 340579, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.263646364212036, + "objective/train/theoretical_loss": 4.369250011589279, + "objective/train/tokens_used": 215429600, + "theoretical_loss": 4.369250011589279, + "tokens_seen": 194969600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752156469408225, + "loss": 3.289, + "theoretical_loss": 4.369250011589279, + "tokens_seen": 194969600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752056168505517, + "loss": 3.4847, + "theoretical_loss": 4.369065547591038, + "tokens_seen": 195035136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047519558676028086, + "loss": 3.3362, + "theoretical_loss": 4.368881162915095, + "tokens_seen": 195100672 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047518555667001004, + "loss": 3.4011, + "theoretical_loss": 4.36869685750071, + "tokens_seen": 195166208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751755265797392, + "loss": 3.3437, + "theoretical_loss": 4.3685126312872145, + "tokens_seen": 195231744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047516549648946846, + "loss": 3.4778, + "theoretical_loss": 4.368328484214002, + "tokens_seen": 195297280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751554663991976, + "loss": 3.4841, + "theoretical_loss": 4.368144416220538, + "tokens_seen": 195362816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751454363089268, + "loss": 3.4648, + "theoretical_loss": 4.3679604272463495, + "tokens_seen": 195428352 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475135406218656, + "loss": 3.3709, + "theoretical_loss": 4.367776517231033, + "tokens_seen": 195493888 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751253761283852, + "loss": 3.3574, + "theoretical_loss": 4.367592686114252, + "tokens_seen": 195559424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047511534603811436, + "loss": 3.472, + "theoretical_loss": 4.367408933835733, + "tokens_seen": 195624960 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047510531594784354, + "loss": 3.4464, + "theoretical_loss": 4.367225260335272, + "tokens_seen": 195690496 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750952858575727, + "loss": 3.4045, + "theoretical_loss": 4.36704166555273, + "tokens_seen": 195756032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047508525576730196, + "loss": 3.2353, + "theoretical_loss": 4.366858149428032, + "tokens_seen": 195821568 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750752256770311, + "loss": 3.3366, + "theoretical_loss": 4.366674711901173, + "tokens_seen": 195887104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750651955867603, + "loss": 3.4265, + "theoretical_loss": 4.366491352912211, + "tokens_seen": 195952640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047505516549648945, + "loss": 3.3598, + "theoretical_loss": 4.366308072401271, + "tokens_seen": 196018176 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750451354062187, + "loss": 3.3881, + "theoretical_loss": 4.366124870308541, + "tokens_seen": 196083712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047503510531594787, + "loss": 3.4548, + "theoretical_loss": 4.365941746574278, + "tokens_seen": 196149248 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047502507522567705, + "loss": 3.4451, + "theoretical_loss": 4.3657587011388035, + "tokens_seen": 196214784 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047501504513540623, + "loss": 3.4696, + "theoretical_loss": 4.365575733942503, + "tokens_seen": 196280320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047500501504513546, + "loss": 3.3357, + "theoretical_loss": 4.365392844925829, + "tokens_seen": 196345856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749949849548646, + "loss": 3.2761, + "theoretical_loss": 4.365210034029298, + "tokens_seen": 196411392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749849548645938, + "loss": 3.3058, + "theoretical_loss": 4.365027301193491, + "tokens_seen": 196476928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047497492477432295, + "loss": 3.424, + "theoretical_loss": 4.364844646359056, + "tokens_seen": 196542464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 343377, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.7077269554138184, + "objective/train/theoretical_loss": 4.364662069466704, + "objective/train/tokens_used": 217068000, + "theoretical_loss": 4.364662069466704, + "tokens_seen": 196608000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749648946840522, + "loss": 3.633, + "theoretical_loss": 4.364662069466704, + "tokens_seen": 196608000 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047495486459378137, + "loss": 3.0237, + "theoretical_loss": 4.364479570457213, + "tokens_seen": 196673536 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047494483450351055, + "loss": 3.1958, + "theoretical_loss": 4.364297149271423, + "tokens_seen": 196739072 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047493480441323973, + "loss": 3.2984, + "theoretical_loss": 4.3641148058502415, + "tokens_seen": 196804608 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749247743229689, + "loss": 3.4717, + "theoretical_loss": 4.363932540134638, + "tokens_seen": 196870144 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004749147442326981, + "loss": 3.5043, + "theoretical_loss": 4.363750352065647, + "tokens_seen": 196935680 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047490471414242733, + "loss": 3.447, + "theoretical_loss": 4.363568241584368, + "tokens_seen": 197001216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047489468405215646, + "loss": 3.3292, + "theoretical_loss": 4.363386208631966, + "tokens_seen": 197066752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748846539618857, + "loss": 3.4757, + "theoretical_loss": 4.363204253149667, + "tokens_seen": 197132288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748746238716148, + "loss": 3.4902, + "theoretical_loss": 4.3630223750787644, + "tokens_seen": 197197824 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047486459378134405, + "loss": 3.3331, + "theoretical_loss": 4.362840574360612, + "tokens_seen": 197263360 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047485456369107323, + "loss": 3.4149, + "theoretical_loss": 4.362658850936631, + "tokens_seen": 197328896 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748445336008024, + "loss": 3.3285, + "theoretical_loss": 4.362477204748305, + "tokens_seen": 197394432 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748345035105316, + "loss": 3.4553, + "theoretical_loss": 4.362295635737179, + "tokens_seen": 197459968 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047482447342026083, + "loss": 3.4526, + "theoretical_loss": 4.362114143844867, + "tokens_seen": 197525504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047481444332998996, + "loss": 3.3795, + "theoretical_loss": 4.3619327290130405, + "tokens_seen": 197591040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004748044132397192, + "loss": 3.4842, + "theoretical_loss": 4.3617513911834385, + "tokens_seen": 197656576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747943831494483, + "loss": 3.297, + "theoretical_loss": 4.361570130297863, + "tokens_seen": 197722112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047478435305917756, + "loss": 3.5321, + "theoretical_loss": 4.3613889462981765, + "tokens_seen": 197787648 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047477432296890674, + "loss": 3.3821, + "theoretical_loss": 4.361207839126308, + "tokens_seen": 197853184 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747642928786359, + "loss": 3.4451, + "theoretical_loss": 4.361026808724247, + "tokens_seen": 197918720 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747542627883651, + "loss": 3.2456, + "theoretical_loss": 4.360845855034049, + "tokens_seen": 197984256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747442326980943, + "loss": 3.3037, + "theoretical_loss": 4.360664977997828, + "tokens_seen": 198049792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047473420260782346, + "loss": 3.2944, + "theoretical_loss": 4.360484177557766, + "tokens_seen": 198115328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747241725175527, + "loss": 3.5291, + "theoretical_loss": 4.360303453656103, + "tokens_seen": 198180864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 346182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3122916221618652, + "objective/train/theoretical_loss": 4.360122806235145, + "objective/train/tokens_used": 218706400, + "theoretical_loss": 4.360122806235145, + "tokens_seen": 198246400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004747141424272818, + "loss": 3.3953, + "theoretical_loss": 4.360122806235145, + "tokens_seen": 198246400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047470411233701106, + "loss": 3.2538, + "theoretical_loss": 4.359942235237257, + "tokens_seen": 198311936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746940822467402, + "loss": 3.4404, + "theoretical_loss": 4.359761740604871, + "tokens_seen": 198377472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746840521564694, + "loss": 3.3443, + "theoretical_loss": 4.359581322280479, + "tokens_seen": 198443008 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746740220661986, + "loss": 3.4766, + "theoretical_loss": 4.359400980206634, + "tokens_seen": 198508544 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746639919759278, + "loss": 3.5114, + "theoretical_loss": 4.359220714325954, + "tokens_seen": 198574080 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047465396188565696, + "loss": 3.2487, + "theoretical_loss": 4.359040524581116, + "tokens_seen": 198639616 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746439317953862, + "loss": 3.3942, + "theoretical_loss": 4.358860410914861, + "tokens_seen": 198705152 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746339017051153, + "loss": 3.3014, + "theoretical_loss": 4.358680373269993, + "tokens_seen": 198770688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047462387161484456, + "loss": 3.409, + "theoretical_loss": 4.358500411589375, + "tokens_seen": 198836224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746138415245737, + "loss": 3.4366, + "theoretical_loss": 4.358320525815934, + "tokens_seen": 198901760 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004746038114343029, + "loss": 3.4101, + "theoretical_loss": 4.358140715892658, + "tokens_seen": 198967296 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745937813440321, + "loss": 3.4232, + "theoretical_loss": 4.357960981762595, + "tokens_seen": 199032832 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745837512537613, + "loss": 3.5503, + "theoretical_loss": 4.357781323368857, + "tokens_seen": 199098368 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047457372116349047, + "loss": 3.5156, + "theoretical_loss": 4.357601740654617, + "tokens_seen": 199163904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047456369107321965, + "loss": 3.4983, + "theoretical_loss": 4.357422233563106, + "tokens_seen": 199229440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047455366098294883, + "loss": 3.4206, + "theoretical_loss": 4.357242802037623, + "tokens_seen": 199294976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047454363089267807, + "loss": 3.1811, + "theoretical_loss": 4.35706344602152, + "tokens_seen": 199360512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745336008024072, + "loss": 3.4024, + "theoretical_loss": 4.356884165458217, + "tokens_seen": 199426048 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047452357071213643, + "loss": 3.4918, + "theoretical_loss": 4.356704960291191, + "tokens_seen": 199491584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047451354062186555, + "loss": 3.303, + "theoretical_loss": 4.35652583046398, + "tokens_seen": 199557120 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004745035105315948, + "loss": 3.4545, + "theoretical_loss": 4.356346775920185, + "tokens_seen": 199622656 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047449348044132397, + "loss": 3.4887, + "theoretical_loss": 4.356167796603467, + "tokens_seen": 199688192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047448345035105315, + "loss": 3.509, + "theoretical_loss": 4.355988892457546, + "tokens_seen": 199753728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744734202607824, + "loss": 3.4006, + "theoretical_loss": 4.355810063426204, + "tokens_seen": 199819264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 348879, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3253164291381836, + "objective/train/theoretical_loss": 4.355631309453283, + "objective/train/tokens_used": 220344800, + "theoretical_loss": 4.355631309453283, + "tokens_seen": 199884800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047446339017051157, + "loss": 3.3187, + "theoretical_loss": 4.355631309453283, + "tokens_seen": 199884800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047445336008024075, + "loss": 3.2317, + "theoretical_loss": 4.355452630482685, + "tokens_seen": 199950336 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047444332998996993, + "loss": 3.3264, + "theoretical_loss": 4.355274026458375, + "tokens_seen": 200015872 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744332998996991, + "loss": 3.4304, + "theoretical_loss": 4.355095497324373, + "tokens_seen": 200081408 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004744232698094283, + "loss": 3.3086, + "theoretical_loss": 4.354917043024765, + "tokens_seen": 200146944 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047441323971915753, + "loss": 3.4979, + "theoretical_loss": 4.354738663503692, + "tokens_seen": 200212480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047440320962888666, + "loss": 3.415, + "theoretical_loss": 4.354560358705358, + "tokens_seen": 200278016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743931795386159, + "loss": 3.292, + "theoretical_loss": 4.354382128574027, + "tokens_seen": 200343552 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474383149448345, + "loss": 3.3241, + "theoretical_loss": 4.35420397305402, + "tokens_seen": 200409088 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047437311935807425, + "loss": 3.4631, + "theoretical_loss": 4.35402589208972, + "tokens_seen": 200474624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047436308926780343, + "loss": 3.3383, + "theoretical_loss": 4.353847885625571, + "tokens_seen": 200540160 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743530591775326, + "loss": 3.6005, + "theoretical_loss": 4.353669953606072, + "tokens_seen": 200605696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743430290872618, + "loss": 3.4298, + "theoretical_loss": 4.353492095975787, + "tokens_seen": 200671232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047433299899699103, + "loss": 3.4692, + "theoretical_loss": 4.353314312679333, + "tokens_seen": 200736768 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047432296890672016, + "loss": 3.2783, + "theoretical_loss": 4.353136603661392, + "tokens_seen": 200802304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743129388164494, + "loss": 3.4814, + "theoretical_loss": 4.352958968866704, + "tokens_seen": 200867840 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004743029087261785, + "loss": 3.5634, + "theoretical_loss": 4.352781408240065, + "tokens_seen": 200933376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047429287863590776, + "loss": 3.4725, + "theoretical_loss": 4.352603921726334, + "tokens_seen": 200998912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047428284854563694, + "loss": 3.3951, + "theoretical_loss": 4.352426509270425, + "tokens_seen": 201064448 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742728184553661, + "loss": 3.3166, + "theoretical_loss": 4.352249170817315, + "tokens_seen": 201129984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742627883650953, + "loss": 3.3642, + "theoretical_loss": 4.352071906312037, + "tokens_seen": 201195520 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742527582748245, + "loss": 3.4696, + "theoretical_loss": 4.351894715699684, + "tokens_seen": 201261056 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047424272818455366, + "loss": 3.4663, + "theoretical_loss": 4.351717598925406, + "tokens_seen": 201326592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742326980942829, + "loss": 3.3806, + "theoretical_loss": 4.351540555934414, + "tokens_seen": 201392128 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474222668004012, + "loss": 3.3337, + "theoretical_loss": 4.351363586671976, + "tokens_seen": 201457664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 350389, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.598541498184204, + "objective/train/theoretical_loss": 4.351186691083417, + "objective/train/tokens_used": 221983200, + "theoretical_loss": 4.351186691083417, + "tokens_seen": 201523200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047421263791374126, + "loss": 3.2957, + "theoretical_loss": 4.351186691083417, + "tokens_seen": 201523200 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742026078234704, + "loss": 3.3647, + "theoretical_loss": 4.351009869114124, + "tokens_seen": 201588736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741925777331996, + "loss": 3.5686, + "theoretical_loss": 4.350833120709539, + "tokens_seen": 201654272 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741825476429288, + "loss": 3.4064, + "theoretical_loss": 4.350656445815164, + "tokens_seen": 201719808 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474172517552658, + "loss": 3.2941, + "theoretical_loss": 4.350479844376557, + "tokens_seen": 201785344 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047416248746238716, + "loss": 3.3579, + "theoretical_loss": 4.350303316339337, + "tokens_seen": 201850880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741524573721164, + "loss": 3.2916, + "theoretical_loss": 4.350126861649178, + "tokens_seen": 201916416 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741424272818455, + "loss": 3.0724, + "theoretical_loss": 4.349950480251813, + "tokens_seen": 201981952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047413239719157476, + "loss": 3.1578, + "theoretical_loss": 4.349774172093033, + "tokens_seen": 202047488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741223671013039, + "loss": 3.4251, + "theoretical_loss": 4.349597937118687, + "tokens_seen": 202113024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741123370110331, + "loss": 3.5322, + "theoretical_loss": 4.3494217752746795, + "tokens_seen": 202178560 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004741023069207623, + "loss": 3.3449, + "theoretical_loss": 4.349245686506976, + "tokens_seen": 202244096 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004740922768304915, + "loss": 3.3258, + "theoretical_loss": 4.349069670761597, + "tokens_seen": 202309632 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047408224674022067, + "loss": 3.3632, + "theoretical_loss": 4.348893727984619, + "tokens_seen": 202375168 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047407221664994985, + "loss": 3.2176, + "theoretical_loss": 4.348717858122178, + "tokens_seen": 202440704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047406218655967903, + "loss": 3.3093, + "theoretical_loss": 4.348542061120469, + "tokens_seen": 202506240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047405215646940827, + "loss": 3.2598, + "theoretical_loss": 4.348366336925739, + "tokens_seen": 202571776 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004740421263791374, + "loss": 3.2576, + "theoretical_loss": 4.3481906854842975, + "tokens_seen": 202637312 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047403209628886663, + "loss": 3.3318, + "theoretical_loss": 4.348015106742507, + "tokens_seen": 202702848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047402206619859575, + "loss": 3.3485, + "theoretical_loss": 4.347839600646786, + "tokens_seen": 202768384 + }, + { + "epoch": 0.06, + "learning_rate": 0.000474012036108325, + "loss": 3.5222, + "theoretical_loss": 4.347664167143615, + "tokens_seen": 202833920 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047400200601805417, + "loss": 3.504, + "theoretical_loss": 4.347488806179528, + "tokens_seen": 202899456 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047399197592778335, + "loss": 3.3849, + "theoretical_loss": 4.347313517701114, + "tokens_seen": 202964992 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047398194583751253, + "loss": 3.3372, + "theoretical_loss": 4.347138301655021, + "tokens_seen": 203030528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047397191574724177, + "loss": 3.4531, + "theoretical_loss": 4.346963157987954, + "tokens_seen": 203096064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 353190, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4621753692626953, + "objective/train/theoretical_loss": 4.346788086646671, + "objective/train/tokens_used": 223621600, + "theoretical_loss": 4.346788086646671, + "tokens_seen": 203161600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739618856569709, + "loss": 3.36, + "theoretical_loss": 4.346788086646671, + "tokens_seen": 203161600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047395185556670013, + "loss": 3.4479, + "theoretical_loss": 4.346613087577991, + "tokens_seen": 203227136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047394182547642926, + "loss": 3.4229, + "theoretical_loss": 4.346438160728785, + "tokens_seen": 203292672 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739317953861585, + "loss": 3.4067, + "theoretical_loss": 4.346263306045983, + "tokens_seen": 203358208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739217652958877, + "loss": 3.2365, + "theoretical_loss": 4.346088523476569, + "tokens_seen": 203423744 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047391173520561686, + "loss": 3.3116, + "theoretical_loss": 4.345913812967584, + "tokens_seen": 203489280 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047390170511534604, + "loss": 3.3311, + "theoretical_loss": 4.345739174466127, + "tokens_seen": 203554816 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738916750250752, + "loss": 3.1834, + "theoretical_loss": 4.345564607919348, + "tokens_seen": 203620352 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738816449348044, + "loss": 3.5808, + "theoretical_loss": 4.3453901132744575, + "tokens_seen": 203685888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047387161484453363, + "loss": 3.4576, + "theoretical_loss": 4.345215690478719, + "tokens_seen": 203751424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047386158475426276, + "loss": 3.4635, + "theoretical_loss": 4.345041339479453, + "tokens_seen": 203816960 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473851554663992, + "loss": 3.4961, + "theoretical_loss": 4.3448670602240345, + "tokens_seen": 203882496 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738415245737211, + "loss": 3.4086, + "theoretical_loss": 4.344692852659895, + "tokens_seen": 203948032 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047383149448345036, + "loss": 3.3979, + "theoretical_loss": 4.34451871673452, + "tokens_seen": 204013568 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047382146439317954, + "loss": 3.6039, + "theoretical_loss": 4.344344652395451, + "tokens_seen": 204079104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738114343029087, + "loss": 3.3413, + "theoretical_loss": 4.3441706595902865, + "tokens_seen": 204144640 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004738014042126379, + "loss": 3.3311, + "theoretical_loss": 4.343996738266677, + "tokens_seen": 204210176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047379137412236714, + "loss": 3.5114, + "theoretical_loss": 4.343822888372331, + "tokens_seen": 204275712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047378134403209626, + "loss": 3.3777, + "theoretical_loss": 4.343649109855009, + "tokens_seen": 204341248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737713139418255, + "loss": 3.2784, + "theoretical_loss": 4.343475402662529, + "tokens_seen": 204406784 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737612838515546, + "loss": 3.3936, + "theoretical_loss": 4.343301766742763, + "tokens_seen": 204472320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047375125376128386, + "loss": 3.352, + "theoretical_loss": 4.343128202043638, + "tokens_seen": 204537856 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047374122367101304, + "loss": 3.3943, + "theoretical_loss": 4.342954708513136, + "tokens_seen": 204603392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737311935807422, + "loss": 3.5171, + "theoretical_loss": 4.342781286099291, + "tokens_seen": 204668928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047372116349047146, + "loss": 3.3275, + "theoretical_loss": 4.3426079347501965, + "tokens_seen": 204734464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 356001, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.544224739074707, + "objective/train/theoretical_loss": 4.342434654413995, + "objective/train/tokens_used": 225260000, + "theoretical_loss": 4.342434654413995, + "tokens_seen": 204800000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737111334002006, + "loss": 3.3203, + "theoretical_loss": 4.342434654413995, + "tokens_seen": 204800000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004737011033099298, + "loss": 3.4121, + "theoretical_loss": 4.342261445038888, + "tokens_seen": 204865536 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473691073219659, + "loss": 3.4159, + "theoretical_loss": 4.342088306573128, + "tokens_seen": 204931072 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736810431293882, + "loss": 3.3608, + "theoretical_loss": 4.341915238965026, + "tokens_seen": 204996608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047367101303911736, + "loss": 3.3228, + "theoretical_loss": 4.34174224216294, + "tokens_seen": 205062144 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736609829488466, + "loss": 3.5111, + "theoretical_loss": 4.34156931611529, + "tokens_seen": 205127680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736509528585757, + "loss": 3.3938, + "theoretical_loss": 4.341396460770547, + "tokens_seen": 205193216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047364092276830496, + "loss": 3.3652, + "theoretical_loss": 4.341223676077232, + "tokens_seen": 205258752 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736308926780341, + "loss": 3.507, + "theoretical_loss": 4.341050961983926, + "tokens_seen": 205324288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736208625877633, + "loss": 3.454, + "theoretical_loss": 4.340878318439261, + "tokens_seen": 205389824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736108324974925, + "loss": 3.4843, + "theoretical_loss": 4.340705745391922, + "tokens_seen": 205455360 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736008024072217, + "loss": 3.4235, + "theoretical_loss": 4.3405332427906504, + "tokens_seen": 205520896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047359077231695087, + "loss": 3.4169, + "theoretical_loss": 4.340360810584238, + "tokens_seen": 205586432 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047358074222668005, + "loss": 3.3008, + "theoretical_loss": 4.340188448721532, + "tokens_seen": 205651968 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047357071213640923, + "loss": 3.1976, + "theoretical_loss": 4.3400161571514335, + "tokens_seen": 205717504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047356068204613847, + "loss": 3.4054, + "theoretical_loss": 4.339843935822895, + "tokens_seen": 205783040 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004735506519558676, + "loss": 3.4606, + "theoretical_loss": 4.339671784684923, + "tokens_seen": 205848576 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047354062186559683, + "loss": 3.3849, + "theoretical_loss": 4.339499703686579, + "tokens_seen": 205914112 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047353059177532595, + "loss": 3.3937, + "theoretical_loss": 4.339327692776977, + "tokens_seen": 205979648 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004735205616850552, + "loss": 3.3147, + "theoretical_loss": 4.339155751905282, + "tokens_seen": 206045184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047351053159478437, + "loss": 3.4341, + "theoretical_loss": 4.338983881020713, + "tokens_seen": 206110720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047350050150451355, + "loss": 3.4511, + "theoretical_loss": 4.338812080072545, + "tokens_seen": 206176256 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047349047141424273, + "loss": 3.411, + "theoretical_loss": 4.338640349010101, + "tokens_seen": 206241792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047348044132397197, + "loss": 3.366, + "theoretical_loss": 4.3384686877827585, + "tokens_seen": 206307328 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734704112337011, + "loss": 3.4013, + "theoretical_loss": 4.338297096339951, + "tokens_seen": 206372864 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 358602, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.029078722000122, + "objective/train/theoretical_loss": 4.33812557463116, + "objective/train/tokens_used": 226898400, + "theoretical_loss": 4.33812557463116, + "tokens_seen": 206438400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047346038114343033, + "loss": 3.2156, + "theoretical_loss": 4.33812557463116, + "tokens_seen": 206438400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047345035105315946, + "loss": 3.5498, + "theoretical_loss": 4.3379541226059235, + "tokens_seen": 206503936 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734403209628887, + "loss": 3.3829, + "theoretical_loss": 4.337782740213827, + "tokens_seen": 206569472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734302908726179, + "loss": 3.5248, + "theoretical_loss": 4.337611427404514, + "tokens_seen": 206635008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047342026078234706, + "loss": 3.4428, + "theoretical_loss": 4.337440184127679, + "tokens_seen": 206700544 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047341023069207624, + "loss": 3.3446, + "theoretical_loss": 4.337269010333065, + "tokens_seen": 206766080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004734002006018054, + "loss": 3.5021, + "theoretical_loss": 4.337097905970471, + "tokens_seen": 206831616 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733901705115346, + "loss": 3.344, + "theoretical_loss": 4.336926870989748, + "tokens_seen": 206897152 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047338014042126383, + "loss": 3.3523, + "theoretical_loss": 4.336755905340797, + "tokens_seen": 206962688 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047337011033099296, + "loss": 3.315, + "theoretical_loss": 4.336585008973573, + "tokens_seen": 207028224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733600802407222, + "loss": 3.4172, + "theoretical_loss": 4.336414181838082, + "tokens_seen": 207093760 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733500501504513, + "loss": 3.4818, + "theoretical_loss": 4.336243423884382, + "tokens_seen": 207159296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047334002006018056, + "loss": 3.3718, + "theoretical_loss": 4.336072735062583, + "tokens_seen": 207224832 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047332998996990974, + "loss": 3.4102, + "theoretical_loss": 4.335902115322847, + "tokens_seen": 207290368 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733199598796389, + "loss": 3.487, + "theoretical_loss": 4.335731564615387, + "tokens_seen": 207355904 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733099297893681, + "loss": 3.2426, + "theoretical_loss": 4.335561082890468, + "tokens_seen": 207421440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047329989969909734, + "loss": 3.2495, + "theoretical_loss": 4.335390670098407, + "tokens_seen": 207486976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047328986960882646, + "loss": 3.4002, + "theoretical_loss": 4.335220326189571, + "tokens_seen": 207552512 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732798395185557, + "loss": 3.4586, + "theoretical_loss": 4.335050051114379, + "tokens_seen": 207618048 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732698094282848, + "loss": 3.2977, + "theoretical_loss": 4.334879844823304, + "tokens_seen": 207683584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047325977933801406, + "loss": 3.4197, + "theoretical_loss": 4.334709707266865, + "tokens_seen": 207749120 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047324974924774324, + "loss": 3.3753, + "theoretical_loss": 4.334539638395636, + "tokens_seen": 207814656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732397191574724, + "loss": 3.5257, + "theoretical_loss": 4.334369638160242, + "tokens_seen": 207880192 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732296890672016, + "loss": 3.3493, + "theoretical_loss": 4.334199706511358, + "tokens_seen": 207945728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004732196589769308, + "loss": 3.4928, + "theoretical_loss": 4.334029843399709, + "tokens_seen": 208011264 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 361329, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.224679946899414, + "objective/train/theoretical_loss": 4.333860048776074, + "objective/train/tokens_used": 228536800, + "theoretical_loss": 4.333860048776074, + "tokens_seen": 208076800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047320962888665997, + "loss": 3.3191, + "theoretical_loss": 4.333860048776074, + "tokens_seen": 208076800 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731995987963892, + "loss": 3.3568, + "theoretical_loss": 4.33369032259128, + "tokens_seen": 208142336 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047318956870611833, + "loss": 3.1689, + "theoretical_loss": 4.333520664796206, + "tokens_seen": 208207872 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047317953861584757, + "loss": 3.4647, + "theoretical_loss": 4.33335107534178, + "tokens_seen": 208273408 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047316950852557675, + "loss": 3.288, + "theoretical_loss": 4.333181554178985, + "tokens_seen": 208338944 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047315947843530593, + "loss": 3.4294, + "theoretical_loss": 4.3330121012588485, + "tokens_seen": 208404480 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731494483450351, + "loss": 3.2777, + "theoretical_loss": 4.332842716532454, + "tokens_seen": 208470016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731394182547643, + "loss": 3.4529, + "theoretical_loss": 4.332673399950932, + "tokens_seen": 208535552 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047312938816449347, + "loss": 3.4269, + "theoretical_loss": 4.332504151465464, + "tokens_seen": 208601088 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004731193580742227, + "loss": 3.3666, + "theoretical_loss": 4.332334971027284, + "tokens_seen": 208666624 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047310932798395183, + "loss": 3.3112, + "theoretical_loss": 4.332165858587672, + "tokens_seen": 208732160 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047309929789368107, + "loss": 3.4917, + "theoretical_loss": 4.331996814097963, + "tokens_seen": 208797696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730892678034102, + "loss": 3.3221, + "theoretical_loss": 4.331827837509538, + "tokens_seen": 208863232 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047307923771313943, + "loss": 3.4119, + "theoretical_loss": 4.331658928773831, + "tokens_seen": 208928768 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730692076228686, + "loss": 3.5084, + "theoretical_loss": 4.331490087842324, + "tokens_seen": 208994304 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730591775325978, + "loss": 3.3731, + "theoretical_loss": 4.33132131466655, + "tokens_seen": 209059840 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473049147442327, + "loss": 3.2396, + "theoretical_loss": 4.3311526091980905, + "tokens_seen": 209125376 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047303911735205615, + "loss": 3.3071, + "theoretical_loss": 4.330983971388578, + "tokens_seen": 209190912 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047302908726178534, + "loss": 3.3675, + "theoretical_loss": 4.330815401189695, + "tokens_seen": 209256448 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047301905717151457, + "loss": 3.3223, + "theoretical_loss": 4.330646898553173, + "tokens_seen": 209321984 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004730090270812437, + "loss": 3.4297, + "theoretical_loss": 4.330478463430792, + "tokens_seen": 209387520 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047299899699097293, + "loss": 3.1576, + "theoretical_loss": 4.330310095774383, + "tokens_seen": 209453056 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729889669007021, + "loss": 3.3848, + "theoretical_loss": 4.330141795535828, + "tokens_seen": 209518592 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729789368104313, + "loss": 3.431, + "theoretical_loss": 4.329973562667053, + "tokens_seen": 209584128 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047296890672016053, + "loss": 3.3258, + "theoretical_loss": 4.3298053971200385, + "tokens_seen": 209649664 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 364064, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.388387441635132, + "objective/train/theoretical_loss": 4.329637298846812, + "objective/train/tokens_used": 230175200, + "theoretical_loss": 4.329637298846812, + "tokens_seen": 209715200 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047295887662988966, + "loss": 3.3484, + "theoretical_loss": 4.329637298846812, + "tokens_seen": 209715200 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729488465396189, + "loss": 3.474, + "theoretical_loss": 4.329469267799451, + "tokens_seen": 209780736 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729388164493481, + "loss": 3.2354, + "theoretical_loss": 4.32930130393008, + "tokens_seen": 209846272 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047292878635907726, + "loss": 3.2676, + "theoretical_loss": 4.329133407190876, + "tokens_seen": 209911808 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047291875626880644, + "loss": 3.3633, + "theoretical_loss": 4.3289655775340625, + "tokens_seen": 209977344 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004729087261785356, + "loss": 3.4231, + "theoretical_loss": 4.328797814911912, + "tokens_seen": 210042880 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728986960882648, + "loss": 3.3633, + "theoretical_loss": 4.328630119276747, + "tokens_seen": 210108416 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047288866599799403, + "loss": 3.3672, + "theoretical_loss": 4.328462490580938, + "tokens_seen": 210173952 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047287863590772316, + "loss": 3.5142, + "theoretical_loss": 4.328294928776903, + "tokens_seen": 210239488 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728686058174524, + "loss": 3.529, + "theoretical_loss": 4.328127433817112, + "tokens_seen": 210305024 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728585757271815, + "loss": 3.3344, + "theoretical_loss": 4.327960005654081, + "tokens_seen": 210370560 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047284854563691076, + "loss": 3.213, + "theoretical_loss": 4.327792644240374, + "tokens_seen": 210436096 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047283851554663994, + "loss": 3.2902, + "theoretical_loss": 4.327625349528605, + "tokens_seen": 210501632 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728284854563691, + "loss": 3.5234, + "theoretical_loss": 4.327458121471436, + "tokens_seen": 210567168 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728184553660983, + "loss": 3.5036, + "theoretical_loss": 4.3272909600215765, + "tokens_seen": 210632704 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047280842527582754, + "loss": 3.492, + "theoretical_loss": 4.327123865131786, + "tokens_seen": 210698240 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047279839518555666, + "loss": 3.2377, + "theoretical_loss": 4.326956836754871, + "tokens_seen": 210763776 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727883650952859, + "loss": 3.3697, + "theoretical_loss": 4.326789874843685, + "tokens_seen": 210829312 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472778335005015, + "loss": 3.2794, + "theoretical_loss": 4.326622979351132, + "tokens_seen": 210894848 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047276830491474426, + "loss": 3.3652, + "theoretical_loss": 4.326456150230163, + "tokens_seen": 210960384 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047275827482447344, + "loss": 3.5352, + "theoretical_loss": 4.326289387433776, + "tokens_seen": 211025920 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727482447342026, + "loss": 3.3001, + "theoretical_loss": 4.326122690915017, + "tokens_seen": 211091456 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727382146439318, + "loss": 3.3409, + "theoretical_loss": 4.325956060626982, + "tokens_seen": 211156992 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472728184553661, + "loss": 3.4132, + "theoretical_loss": 4.325789496522812, + "tokens_seen": 211222528 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047271815446339017, + "loss": 3.4523, + "theoretical_loss": 4.325622998555697, + "tokens_seen": 211288064 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 367034, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3237648010253906, + "objective/train/theoretical_loss": 4.3254565666788745, + "objective/train/tokens_used": 231813600, + "theoretical_loss": 4.3254565666788745, + "tokens_seen": 211353600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727081243731194, + "loss": 3.3947, + "theoretical_loss": 4.3254565666788745, + "tokens_seen": 211353600 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047269809428284853, + "loss": 3.4264, + "theoretical_loss": 4.325290200845629, + "tokens_seen": 211419136 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047268806419257777, + "loss": 3.403, + "theoretical_loss": 4.3251239010092934, + "tokens_seen": 211484672 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047267803410230695, + "loss": 3.4886, + "theoretical_loss": 4.324957667123249, + "tokens_seen": 211550208 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047266800401203613, + "loss": 3.4492, + "theoretical_loss": 4.32479149914092, + "tokens_seen": 211615744 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726579739217653, + "loss": 3.3872, + "theoretical_loss": 4.324625397015783, + "tokens_seen": 211681280 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726479438314945, + "loss": 3.428, + "theoretical_loss": 4.3244593607013595, + "tokens_seen": 211746816 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047263791374122367, + "loss": 3.2949, + "theoretical_loss": 4.324293390151218, + "tokens_seen": 211812352 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726278836509529, + "loss": 3.3821, + "theoretical_loss": 4.324127485318975, + "tokens_seen": 211877888 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047261785356068203, + "loss": 3.3246, + "theoretical_loss": 4.323961646158294, + "tokens_seen": 211943424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047260782347041127, + "loss": 3.3483, + "theoretical_loss": 4.323795872622884, + "tokens_seen": 212008960 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725977933801404, + "loss": 3.4509, + "theoretical_loss": 4.323630164666502, + "tokens_seen": 212074496 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047258776328986963, + "loss": 3.3161, + "theoretical_loss": 4.323464522242954, + "tokens_seen": 212140032 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725777331995988, + "loss": 3.4119, + "theoretical_loss": 4.323298945306089, + "tokens_seen": 212205568 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472567703109328, + "loss": 3.4703, + "theoretical_loss": 4.3231334338098035, + "tokens_seen": 212271104 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725576730190572, + "loss": 3.3215, + "theoretical_loss": 4.322967987708043, + "tokens_seen": 212336640 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047254764292878636, + "loss": 3.2984, + "theoretical_loss": 4.322802606954798, + "tokens_seen": 212402176 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047253761283851554, + "loss": 3.221, + "theoretical_loss": 4.322637291504106, + "tokens_seen": 212467712 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047252758274824477, + "loss": 3.0415, + "theoretical_loss": 4.32247204131005, + "tokens_seen": 212533248 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004725175526579739, + "loss": 3.3743, + "theoretical_loss": 4.322306856326761, + "tokens_seen": 212598784 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047250752256770313, + "loss": 3.33, + "theoretical_loss": 4.322141736508415, + "tokens_seen": 212664320 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724974924774323, + "loss": 3.3736, + "theoretical_loss": 4.321976681809236, + "tokens_seen": 212729856 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724874623871615, + "loss": 3.3206, + "theoretical_loss": 4.321811692183491, + "tokens_seen": 212795392 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724774322968907, + "loss": 3.4253, + "theoretical_loss": 4.321646767585497, + "tokens_seen": 212860928 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047246740220661986, + "loss": 3.3229, + "theoretical_loss": 4.3214819079696145, + "tokens_seen": 212926464 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 368444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6967885494232178, + "objective/train/theoretical_loss": 4.321317113290252, + "objective/train/tokens_used": 233452000, + "theoretical_loss": 4.321317113290252, + "tokens_seen": 212992000 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047245737211634904, + "loss": 3.5122, + "theoretical_loss": 4.321317113290252, + "tokens_seen": 212992000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724473420260783, + "loss": 3.5257, + "theoretical_loss": 4.321152383501863, + "tokens_seen": 213057536 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724373119358074, + "loss": 3.3347, + "theoretical_loss": 4.320987718558945, + "tokens_seen": 213123072 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047242728184553664, + "loss": 3.3234, + "theoretical_loss": 4.320823118416046, + "tokens_seen": 213188608 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047241725175526576, + "loss": 3.3791, + "theoretical_loss": 4.320658583027755, + "tokens_seen": 213254144 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472407221664995, + "loss": 3.3, + "theoretical_loss": 4.32049411234871, + "tokens_seen": 213319680 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723971915747242, + "loss": 3.2606, + "theoretical_loss": 4.3203297063335935, + "tokens_seen": 213385216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047238716148445336, + "loss": 3.4321, + "theoretical_loss": 4.320165364937134, + "tokens_seen": 213450752 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047237713139418254, + "loss": 3.5002, + "theoretical_loss": 4.320001088114105, + "tokens_seen": 213516288 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723671013039117, + "loss": 3.3164, + "theoretical_loss": 4.319836875819325, + "tokens_seen": 213581824 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723570712136409, + "loss": 3.3949, + "theoretical_loss": 4.31967272800766, + "tokens_seen": 213647360 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047234704112337014, + "loss": 3.2184, + "theoretical_loss": 4.319508644634021, + "tokens_seen": 213712896 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047233701103309927, + "loss": 3.368, + "theoretical_loss": 4.319344625653361, + "tokens_seen": 213778432 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723269809428285, + "loss": 3.3786, + "theoretical_loss": 4.319180671020684, + "tokens_seen": 213843968 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723169508525577, + "loss": 3.2478, + "theoretical_loss": 4.319016780691033, + "tokens_seen": 213909504 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047230692076228686, + "loss": 3.2954, + "theoretical_loss": 4.318852954619501, + "tokens_seen": 213975040 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047229689067201605, + "loss": 3.5047, + "theoretical_loss": 4.318689192761225, + "tokens_seen": 214040576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004722868605817452, + "loss": 3.3439, + "theoretical_loss": 4.318525495071385, + "tokens_seen": 214106112 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004722768304914744, + "loss": 3.3963, + "theoretical_loss": 4.318361861505207, + "tokens_seen": 214171648 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047226680040120364, + "loss": 3.4792, + "theoretical_loss": 4.318198292017964, + "tokens_seen": 214237184 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047225677031093277, + "loss": 3.4319, + "theoretical_loss": 4.318034786564971, + "tokens_seen": 214302720 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472246740220662, + "loss": 3.4209, + "theoretical_loss": 4.31787134510159, + "tokens_seen": 214368256 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047223671013039113, + "loss": 3.3729, + "theoretical_loss": 4.3177079675832255, + "tokens_seen": 214433792 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047222668004012037, + "loss": 3.4369, + "theoretical_loss": 4.317544653965329, + "tokens_seen": 214499328 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004722166499498496, + "loss": 3.3998, + "theoretical_loss": 4.3173814042033944, + "tokens_seen": 214564864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 371361, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5362331867218018, + "objective/train/theoretical_loss": 4.317218218252963, + "objective/train/tokens_used": 235090400, + "theoretical_loss": 4.317218218252963, + "tokens_seen": 214630400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047220661985957873, + "loss": 3.4646, + "theoretical_loss": 4.317218218252963, + "tokens_seen": 214630400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047219658976930797, + "loss": 3.4172, + "theoretical_loss": 4.317055096069618, + "tokens_seen": 214695936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047218655967903715, + "loss": 3.4878, + "theoretical_loss": 4.316892037608987, + "tokens_seen": 214761472 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047217652958876633, + "loss": 3.3838, + "theoretical_loss": 4.316729042826745, + "tokens_seen": 214827008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721664994984955, + "loss": 3.3885, + "theoretical_loss": 4.316566111678609, + "tokens_seen": 214892544 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721564694082247, + "loss": 3.3444, + "theoretical_loss": 4.316403244120339, + "tokens_seen": 214958080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047214643931795387, + "loss": 3.4202, + "theoretical_loss": 4.3162404401077445, + "tokens_seen": 215023616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721364092276831, + "loss": 3.4874, + "theoretical_loss": 4.316077699596671, + "tokens_seen": 215089152 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047212637913741223, + "loss": 3.2452, + "theoretical_loss": 4.315915022543016, + "tokens_seen": 215154688 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047211634904714147, + "loss": 3.3579, + "theoretical_loss": 4.315752408902716, + "tokens_seen": 215220224 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721063189568706, + "loss": 3.3425, + "theoretical_loss": 4.315589858631755, + "tokens_seen": 215285760 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047209628886659983, + "loss": 3.1321, + "theoretical_loss": 4.315427371686157, + "tokens_seen": 215351296 + }, + { + "epoch": 0.07, + "learning_rate": 0.000472086258776329, + "loss": 3.2872, + "theoretical_loss": 4.315264948021994, + "tokens_seen": 215416832 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720762286860582, + "loss": 3.2612, + "theoretical_loss": 4.315102587595379, + "tokens_seen": 215482368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720661985957874, + "loss": 3.3545, + "theoretical_loss": 4.31494029036247, + "tokens_seen": 215547904 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047205616850551656, + "loss": 3.5065, + "theoretical_loss": 4.314778056279468, + "tokens_seen": 215613440 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047204613841524574, + "loss": 3.3965, + "theoretical_loss": 4.314615885302619, + "tokens_seen": 215678976 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047203610832497497, + "loss": 3.4836, + "theoretical_loss": 4.314453777388209, + "tokens_seen": 215744512 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720260782347041, + "loss": 3.4358, + "theoretical_loss": 4.314291732492573, + "tokens_seen": 215810048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047201604814443333, + "loss": 3.1123, + "theoretical_loss": 4.314129750572087, + "tokens_seen": 215875584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004720060180541625, + "loss": 3.2843, + "theoretical_loss": 4.3139678315831675, + "tokens_seen": 215941120 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719959879638917, + "loss": 3.2624, + "theoretical_loss": 4.313805975482278, + "tokens_seen": 216006656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719859578736209, + "loss": 3.5299, + "theoretical_loss": 4.313644182225926, + "tokens_seen": 216072192 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047197592778335006, + "loss": 3.3041, + "theoretical_loss": 4.313482451770659, + "tokens_seen": 216137728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047196589769307924, + "loss": 3.5176, + "theoretical_loss": 4.313320784073069, + "tokens_seen": 216203264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 373872, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.197885036468506, + "objective/train/theoretical_loss": 4.3131591790897925, + "objective/train/tokens_used": 236728800, + "theoretical_loss": 4.3131591790897925, + "tokens_seen": 216268800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719558676028085, + "loss": 3.2049, + "theoretical_loss": 4.3131591790897925, + "tokens_seen": 216268800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719458375125376, + "loss": 3.2892, + "theoretical_loss": 4.3129976367775065, + "tokens_seen": 216334336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047193580742226684, + "loss": 3.2975, + "theoretical_loss": 4.312836157092934, + "tokens_seen": 216399872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047192577733199596, + "loss": 3.336, + "theoretical_loss": 4.312674739992839, + "tokens_seen": 216465408 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719157472417252, + "loss": 3.3724, + "theoretical_loss": 4.31251338543403, + "tokens_seen": 216530944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719057171514544, + "loss": 3.2925, + "theoretical_loss": 4.312352093373354, + "tokens_seen": 216596480 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047189568706118356, + "loss": 3.3869, + "theoretical_loss": 4.312190863767708, + "tokens_seen": 216662016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047188565697091274, + "loss": 3.3873, + "theoretical_loss": 4.312029696574027, + "tokens_seen": 216727552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718756268806419, + "loss": 3.3754, + "theoretical_loss": 4.311868591749287, + "tokens_seen": 216793088 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718655967903711, + "loss": 3.3157, + "theoretical_loss": 4.311707549250514, + "tokens_seen": 216858624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047185556670010034, + "loss": 3.2795, + "theoretical_loss": 4.311546569034767, + "tokens_seen": 216924160 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047184553660982947, + "loss": 3.3068, + "theoretical_loss": 4.311385651059155, + "tokens_seen": 216989696 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718355065195587, + "loss": 3.3936, + "theoretical_loss": 4.311224795280825, + "tokens_seen": 217055232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718254764292879, + "loss": 3.4809, + "theoretical_loss": 4.3110640016569715, + "tokens_seen": 217120768 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047181544633901706, + "loss": 3.4209, + "theoretical_loss": 4.310903270144825, + "tokens_seen": 217186304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047180541624874625, + "loss": 3.4586, + "theoretical_loss": 4.310742600701664, + "tokens_seen": 217251840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717953861584754, + "loss": 3.4732, + "theoretical_loss": 4.310581993284805, + "tokens_seen": 217317376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717853560682046, + "loss": 3.442, + "theoretical_loss": 4.310421447851609, + "tokens_seen": 217382912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047177532597793384, + "loss": 3.3069, + "theoretical_loss": 4.310260964359479, + "tokens_seen": 217448448 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047176529588766297, + "loss": 3.3168, + "theoretical_loss": 4.310100542765858, + "tokens_seen": 217513984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717552657973922, + "loss": 3.4094, + "theoretical_loss": 4.309940183028236, + "tokens_seen": 217579520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047174523570712133, + "loss": 3.5125, + "theoretical_loss": 4.309779885104139, + "tokens_seen": 217645056 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047173520561685057, + "loss": 3.2747, + "theoretical_loss": 4.309619648951139, + "tokens_seen": 217710592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047172517552657975, + "loss": 3.4483, + "theoretical_loss": 4.3094594745268475, + "tokens_seen": 217776128 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047171514543630893, + "loss": 3.3913, + "theoretical_loss": 4.30929936178892, + "tokens_seen": 217841664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 376678, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.515946626663208, + "objective/train/theoretical_loss": 4.309139310695053, + "objective/train/tokens_used": 238367200, + "theoretical_loss": 4.309139310695053, + "tokens_seen": 217907200 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004717051153460381, + "loss": 3.2638, + "theoretical_loss": 4.309139310695053, + "tokens_seen": 217907200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047169508525576735, + "loss": 3.433, + "theoretical_loss": 4.308979321202983, + "tokens_seen": 217972736 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716850551654965, + "loss": 3.2031, + "theoretical_loss": 4.308819393270491, + "tokens_seen": 218038272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716750250752257, + "loss": 3.4098, + "theoretical_loss": 4.308659526855396, + "tokens_seen": 218103808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047166499498495484, + "loss": 3.3511, + "theoretical_loss": 4.308499721915563, + "tokens_seen": 218169344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047165496489468407, + "loss": 3.4951, + "theoretical_loss": 4.308339978408897, + "tokens_seen": 218234880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047164493480441325, + "loss": 3.4577, + "theoretical_loss": 4.308180296293341, + "tokens_seen": 218300416 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047163490471414243, + "loss": 3.2904, + "theoretical_loss": 4.308020675526883, + "tokens_seen": 218365952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716248746238716, + "loss": 3.4817, + "theoretical_loss": 4.307861116067554, + "tokens_seen": 218431488 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004716148445336008, + "loss": 3.5937, + "theoretical_loss": 4.30770161787342, + "tokens_seen": 218497024 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047160481444333, + "loss": 3.3579, + "theoretical_loss": 4.307542180902594, + "tokens_seen": 218562560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715947843530592, + "loss": 3.2114, + "theoretical_loss": 4.307382805113228, + "tokens_seen": 218628096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047158475426278834, + "loss": 3.3142, + "theoretical_loss": 4.307223490463516, + "tokens_seen": 218693632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715747241725176, + "loss": 3.4309, + "theoretical_loss": 4.307064236911692, + "tokens_seen": 218759168 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715646940822467, + "loss": 3.4817, + "theoretical_loss": 4.30690504441603, + "tokens_seen": 218824704 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047155466399197594, + "loss": 3.2955, + "theoretical_loss": 4.306745912934849, + "tokens_seen": 218890240 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715446339017051, + "loss": 3.1512, + "theoretical_loss": 4.306586842426504, + "tokens_seen": 218955776 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715346038114343, + "loss": 3.4091, + "theoretical_loss": 4.306427832849394, + "tokens_seen": 219021312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715245737211635, + "loss": 3.1853, + "theoretical_loss": 4.306268884161959, + "tokens_seen": 219086848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004715145436308927, + "loss": 3.3389, + "theoretical_loss": 4.306109996322679, + "tokens_seen": 219152384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047150451354062184, + "loss": 3.2393, + "theoretical_loss": 4.305951169290073, + "tokens_seen": 219217920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714944834503511, + "loss": 3.4311, + "theoretical_loss": 4.305792403022703, + "tokens_seen": 219283456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714844533600802, + "loss": 3.3593, + "theoretical_loss": 4.305633697479171, + "tokens_seen": 219348992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047147442326980944, + "loss": 3.3141, + "theoretical_loss": 4.305475052618119, + "tokens_seen": 219414528 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714643931795387, + "loss": 3.5372, + "theoretical_loss": 4.30531646839823, + "tokens_seen": 219480064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 379444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3172130584716797, + "objective/train/theoretical_loss": 4.305157944778228, + "objective/train/tokens_used": 240005600, + "theoretical_loss": 4.305157944778228, + "tokens_seen": 219545600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714543630892678, + "loss": 3.4693, + "theoretical_loss": 4.305157944778228, + "tokens_seen": 219545600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047144433299899704, + "loss": 3.3644, + "theoretical_loss": 4.304999481716876, + "tokens_seen": 219611136 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047143430290872616, + "loss": 3.4245, + "theoretical_loss": 4.304841079172979, + "tokens_seen": 219676672 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714242728184554, + "loss": 3.3707, + "theoretical_loss": 4.30468273710538, + "tokens_seen": 219742208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004714142427281846, + "loss": 3.4111, + "theoretical_loss": 4.304524455472965, + "tokens_seen": 219807744 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047140421263791376, + "loss": 3.3732, + "theoretical_loss": 4.304366234234659, + "tokens_seen": 219873280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047139418254764294, + "loss": 3.5025, + "theoretical_loss": 4.304208073349426, + "tokens_seen": 219938816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713841524573721, + "loss": 3.3762, + "theoretical_loss": 4.304049972776271, + "tokens_seen": 220004352 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713741223671013, + "loss": 3.407, + "theoretical_loss": 4.30389193247424, + "tokens_seen": 220069888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047136409227683054, + "loss": 3.3583, + "theoretical_loss": 4.303733952402419, + "tokens_seen": 220135424 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047135406218655967, + "loss": 3.5579, + "theoretical_loss": 4.303576032519931, + "tokens_seen": 220200960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713440320962889, + "loss": 3.4123, + "theoretical_loss": 4.303418172785943, + "tokens_seen": 220266496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713340020060181, + "loss": 3.5468, + "theoretical_loss": 4.303260373159659, + "tokens_seen": 220332032 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047132397191574726, + "loss": 3.2797, + "theoretical_loss": 4.303102633600322, + "tokens_seen": 220397568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047131394182547645, + "loss": 3.4645, + "theoretical_loss": 4.30294495406722, + "tokens_seen": 220463104 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047130391173520563, + "loss": 3.3902, + "theoretical_loss": 4.3027873345196745, + "tokens_seen": 220528640 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712938816449348, + "loss": 3.5021, + "theoretical_loss": 4.302629774917049, + "tokens_seen": 220594176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047128385155466404, + "loss": 3.3986, + "theoretical_loss": 4.302472275218748, + "tokens_seen": 220659712 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047127382146439317, + "loss": 3.3479, + "theoretical_loss": 4.302314835384214, + "tokens_seen": 220725248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712637913741224, + "loss": 3.3959, + "theoretical_loss": 4.30215745537293, + "tokens_seen": 220790784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047125376128385153, + "loss": 3.3047, + "theoretical_loss": 4.302000135144416, + "tokens_seen": 220856320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047124373119358077, + "loss": 3.459, + "theoretical_loss": 4.301842874658235, + "tokens_seen": 220921856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047123370110330995, + "loss": 3.3944, + "theoretical_loss": 4.301685673873987, + "tokens_seen": 220987392 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047122367101303913, + "loss": 3.3008, + "theoretical_loss": 4.301528532751312, + "tokens_seen": 221052928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004712136409227683, + "loss": 3.4421, + "theoretical_loss": 4.301371451249888, + "tokens_seen": 221118464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 382238, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2482657432556152, + "objective/train/theoretical_loss": 4.301214429329433, + "objective/train/tokens_used": 241644000, + "theoretical_loss": 4.301214429329433, + "tokens_seen": 221184000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047120361083249755, + "loss": 3.2581, + "theoretical_loss": 4.301214429329433, + "tokens_seen": 221184000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711935807422267, + "loss": 3.4, + "theoretical_loss": 4.301057466949707, + "tokens_seen": 221249536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711835506519559, + "loss": 3.2736, + "theoretical_loss": 4.300900564070504, + "tokens_seen": 221315072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047117352056168504, + "loss": 3.4197, + "theoretical_loss": 4.30074372065166, + "tokens_seen": 221380608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047116349047141427, + "loss": 3.3698, + "theoretical_loss": 4.300586936653049, + "tokens_seen": 221446144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047115346038114345, + "loss": 3.3907, + "theoretical_loss": 4.300430212034587, + "tokens_seen": 221511680 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047114343029087263, + "loss": 3.2755, + "theoretical_loss": 4.300273546756223, + "tokens_seen": 221577216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711334002006018, + "loss": 3.4788, + "theoretical_loss": 4.300116940777951, + "tokens_seen": 221642752 + }, + { + "epoch": 0.07, + "learning_rate": 0.000471123370110331, + "loss": 3.3725, + "theoretical_loss": 4.299960394059799, + "tokens_seen": 221708288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711133400200602, + "loss": 3.2772, + "theoretical_loss": 4.299803906561835, + "tokens_seen": 221773824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711033099297894, + "loss": 3.3463, + "theoretical_loss": 4.29964747824417, + "tokens_seen": 221839360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047109327983951854, + "loss": 3.32, + "theoretical_loss": 4.299491109066947, + "tokens_seen": 221904896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710832497492478, + "loss": 3.5138, + "theoretical_loss": 4.299334798990351, + "tokens_seen": 221970432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710732196589769, + "loss": 3.3749, + "theoretical_loss": 4.2991785479746065, + "tokens_seen": 222035968 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047106318956870614, + "loss": 3.3312, + "theoretical_loss": 4.299022355979974, + "tokens_seen": 222101504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710531594784353, + "loss": 3.3995, + "theoretical_loss": 4.298866222966755, + "tokens_seen": 222167040 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710431293881645, + "loss": 3.3829, + "theoretical_loss": 4.298710148895286, + "tokens_seen": 222232576 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710330992978937, + "loss": 3.3636, + "theoretical_loss": 4.298554133725946, + "tokens_seen": 222298112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710230692076229, + "loss": 3.331, + "theoretical_loss": 4.298398177419149, + "tokens_seen": 222363648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047101303911735204, + "loss": 3.4652, + "theoretical_loss": 4.298242279935349, + "tokens_seen": 222429184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710030090270813, + "loss": 3.2771, + "theoretical_loss": 4.2980864412350375, + "tokens_seen": 222494720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709929789368104, + "loss": 3.3787, + "theoretical_loss": 4.297930661278745, + "tokens_seen": 222560256 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047098294884653964, + "loss": 3.2744, + "theoretical_loss": 4.297774940027038, + "tokens_seen": 222625792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709729187562688, + "loss": 3.4303, + "theoretical_loss": 4.297619277440523, + "tokens_seen": 222691328 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470962888665998, + "loss": 3.31, + "theoretical_loss": 4.297463673479846, + "tokens_seen": 222756864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 385121, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.135535955429077, + "objective/train/theoretical_loss": 4.297308128105687, + "objective/train/tokens_used": 243282400, + "theoretical_loss": 4.297308128105687, + "tokens_seen": 222822400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709528585757272, + "loss": 3.165, + "theoretical_loss": 4.297308128105687, + "tokens_seen": 222822400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047094282848545636, + "loss": 3.2779, + "theoretical_loss": 4.297152641278767, + "tokens_seen": 222887936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047093279839518555, + "loss": 3.4229, + "theoretical_loss": 4.296997212959842, + "tokens_seen": 222953472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709227683049148, + "loss": 3.2624, + "theoretical_loss": 4.296841843109711, + "tokens_seen": 223019008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709127382146439, + "loss": 3.3539, + "theoretical_loss": 4.296686531689204, + "tokens_seen": 223084544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047090270812437314, + "loss": 3.2993, + "theoretical_loss": 4.296531278659193, + "tokens_seen": 223150080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047089267803410227, + "loss": 3.2397, + "theoretical_loss": 4.296376083980589, + "tokens_seen": 223215616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708826479438315, + "loss": 3.4149, + "theoretical_loss": 4.296220947614337, + "tokens_seen": 223281152 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708726178535607, + "loss": 3.3635, + "theoretical_loss": 4.296065869521421, + "tokens_seen": 223346688 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047086258776328987, + "loss": 3.3573, + "theoretical_loss": 4.295910849662862, + "tokens_seen": 223412224 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047085255767301905, + "loss": 3.4862, + "theoretical_loss": 4.2957558879997215, + "tokens_seen": 223477760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708425275827483, + "loss": 3.3839, + "theoretical_loss": 4.295600984493093, + "tokens_seen": 223543296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708324974924774, + "loss": 3.3545, + "theoretical_loss": 4.295446139104112, + "tokens_seen": 223608832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047082246740220665, + "loss": 3.4375, + "theoretical_loss": 4.295291351793951, + "tokens_seen": 223674368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004708124373119358, + "loss": 3.2596, + "theoretical_loss": 4.295136622523817, + "tokens_seen": 223739904 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470802407221665, + "loss": 3.3996, + "theoretical_loss": 4.294981951254956, + "tokens_seen": 223805440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707923771313942, + "loss": 3.4212, + "theoretical_loss": 4.294827337948651, + "tokens_seen": 223870976 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047078234704112337, + "loss": 3.3905, + "theoretical_loss": 4.294672782566224, + "tokens_seen": 223936512 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047077231695085255, + "loss": 3.2894, + "theoretical_loss": 4.29451828506903, + "tokens_seen": 224002048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047076228686058173, + "loss": 3.3979, + "theoretical_loss": 4.294363845418465, + "tokens_seen": 224067584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707522567703109, + "loss": 3.3077, + "theoretical_loss": 4.29420946357596, + "tokens_seen": 224133120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047074222668004015, + "loss": 3.3151, + "theoretical_loss": 4.294055139502985, + "tokens_seen": 224198656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707321965897693, + "loss": 3.2713, + "theoretical_loss": 4.293900873161043, + "tokens_seen": 224264192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707221664994985, + "loss": 3.3814, + "theoretical_loss": 4.293746664511678, + "tokens_seen": 224329728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047071213640922775, + "loss": 3.5934, + "theoretical_loss": 4.293592513516469, + "tokens_seen": 224395264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 387710, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.078070878982544, + "objective/train/theoretical_loss": 4.293438420137031, + "objective/train/tokens_used": 244920800, + "theoretical_loss": 4.293438420137031, + "tokens_seen": 224460800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004707021063189569, + "loss": 3.2364, + "theoretical_loss": 4.293438420137031, + "tokens_seen": 224460800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706920762286861, + "loss": 3.365, + "theoretical_loss": 4.293284384335017, + "tokens_seen": 224526336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047068204613841524, + "loss": 3.3627, + "theoretical_loss": 4.293130406072118, + "tokens_seen": 224591872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047067201604814447, + "loss": 3.4811, + "theoretical_loss": 4.292976485310057, + "tokens_seen": 224657408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047066198595787365, + "loss": 3.1677, + "theoretical_loss": 4.2928226220106005, + "tokens_seen": 224722944 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047065195586760283, + "loss": 3.3227, + "theoretical_loss": 4.292668816135545, + "tokens_seen": 224788480 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470641925777332, + "loss": 3.415, + "theoretical_loss": 4.292515067646727, + "tokens_seen": 224854016 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706318956870612, + "loss": 3.4118, + "theoretical_loss": 4.29236137650602, + "tokens_seen": 224919552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706218655967904, + "loss": 3.4277, + "theoretical_loss": 4.2922077426753305, + "tokens_seen": 224985088 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004706118355065196, + "loss": 3.3828, + "theoretical_loss": 4.292054166116605, + "tokens_seen": 225050624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047060180541624874, + "loss": 3.3227, + "theoretical_loss": 4.291900646791825, + "tokens_seen": 225116160 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470591775325978, + "loss": 3.4037, + "theoretical_loss": 4.2917471846630075, + "tokens_seen": 225181696 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705817452357071, + "loss": 3.5052, + "theoretical_loss": 4.291593779692207, + "tokens_seen": 225247232 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047057171514543634, + "loss": 3.4105, + "theoretical_loss": 4.291440431841513, + "tokens_seen": 225312768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705616850551655, + "loss": 3.4572, + "theoretical_loss": 4.291287141073053, + "tokens_seen": 225378304 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705516549648947, + "loss": 3.1627, + "theoretical_loss": 4.291133907348989, + "tokens_seen": 225443840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705416248746239, + "loss": 3.2843, + "theoretical_loss": 4.29098073063152, + "tokens_seen": 225509376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705315947843531, + "loss": 3.3572, + "theoretical_loss": 4.29082761088288, + "tokens_seen": 225574912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047052156469408224, + "loss": 3.2793, + "theoretical_loss": 4.290674548065338, + "tokens_seen": 225640448 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705115346038115, + "loss": 3.4328, + "theoretical_loss": 4.290521542141203, + "tokens_seen": 225705984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705015045135406, + "loss": 3.3848, + "theoretical_loss": 4.290368593072817, + "tokens_seen": 225771520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047049147442326984, + "loss": 3.3278, + "theoretical_loss": 4.290215700822556, + "tokens_seen": 225837056 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470481444332999, + "loss": 3.2474, + "theoretical_loss": 4.290062865352837, + "tokens_seen": 225902592 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704714142427282, + "loss": 3.3278, + "theoretical_loss": 4.289910086626108, + "tokens_seen": 225968128 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704613841524574, + "loss": 3.3861, + "theoretical_loss": 4.289757364604855, + "tokens_seen": 226033664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 389112, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2922399044036865, + "objective/train/theoretical_loss": 4.2896046992515995, + "objective/train/tokens_used": 246559200, + "theoretical_loss": 4.2896046992515995, + "tokens_seen": 226099200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047045135406218656, + "loss": 3.3395, + "theoretical_loss": 4.2896046992515995, + "tokens_seen": 226099200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047044132397191575, + "loss": 3.1382, + "theoretical_loss": 4.289452090528897, + "tokens_seen": 226164736 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470431293881645, + "loss": 3.3452, + "theoretical_loss": 4.289299538399341, + "tokens_seen": 226230272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004704212637913741, + "loss": 3.2479, + "theoretical_loss": 4.28914704282556, + "tokens_seen": 226295808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047041123370110334, + "loss": 3.3841, + "theoretical_loss": 4.288994603770215, + "tokens_seen": 226361344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047040120361083247, + "loss": 3.1905, + "theoretical_loss": 4.288842221196007, + "tokens_seen": 226426880 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703911735205617, + "loss": 3.4169, + "theoretical_loss": 4.28868989506567, + "tokens_seen": 226492416 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703811434302909, + "loss": 3.3683, + "theoretical_loss": 4.288537625341974, + "tokens_seen": 226557952 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047037111334002007, + "loss": 3.1758, + "theoretical_loss": 4.288385411987722, + "tokens_seen": 226623488 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047036108324974925, + "loss": 3.3084, + "theoretical_loss": 4.288233254965755, + "tokens_seen": 226689024 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703510531594785, + "loss": 3.3416, + "theoretical_loss": 4.2880811542389505, + "tokens_seen": 226754560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703410230692076, + "loss": 3.4587, + "theoretical_loss": 4.287929109770217, + "tokens_seen": 226820096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047033099297893685, + "loss": 3.3101, + "theoretical_loss": 4.287777121522501, + "tokens_seen": 226885632 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470320962888666, + "loss": 3.4654, + "theoretical_loss": 4.287625189458781, + "tokens_seen": 226951168 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703109327983952, + "loss": 3.4646, + "theoretical_loss": 4.287473313542077, + "tokens_seen": 227016704 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004703009027081244, + "loss": 3.2567, + "theoretical_loss": 4.287321493735438, + "tokens_seen": 227082240 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047029087261785357, + "loss": 3.3931, + "theoretical_loss": 4.287169730001949, + "tokens_seen": 227147776 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047028084252758275, + "loss": 3.454, + "theoretical_loss": 4.287018022304733, + "tokens_seen": 227213312 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047027081243731193, + "loss": 3.1141, + "theoretical_loss": 4.286866370606943, + "tokens_seen": 227278848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702607823470411, + "loss": 3.3568, + "theoretical_loss": 4.286714774871772, + "tokens_seen": 227344384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047025075225677035, + "loss": 3.3087, + "theoretical_loss": 4.286563235062444, + "tokens_seen": 227409920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702407221664995, + "loss": 3.3862, + "theoretical_loss": 4.28641175114222, + "tokens_seen": 227475456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702306920762287, + "loss": 3.3772, + "theoretical_loss": 4.286260323074394, + "tokens_seen": 227540992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047022066198595784, + "loss": 3.4206, + "theoretical_loss": 4.286108950822296, + "tokens_seen": 227606528 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004702106318956871, + "loss": 3.4532, + "theoretical_loss": 4.285957634349289, + "tokens_seen": 227672064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 391912, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.699742317199707, + "objective/train/theoretical_loss": 4.285806373618774, + "objective/train/tokens_used": 248197600, + "theoretical_loss": 4.285806373618774, + "tokens_seen": 227737600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047020060180541626, + "loss": 3.4425, + "theoretical_loss": 4.285806373618774, + "tokens_seen": 227737600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047019057171514544, + "loss": 3.2776, + "theoretical_loss": 4.285655168594182, + "tokens_seen": 227803136 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701805416248746, + "loss": 3.3048, + "theoretical_loss": 4.285504019238982, + "tokens_seen": 227868672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047017051153460385, + "loss": 3.3141, + "theoretical_loss": 4.285352925516676, + "tokens_seen": 227934208 + }, + { + "epoch": 0.07, + "learning_rate": 0.000470160481444333, + "loss": 3.3072, + "theoretical_loss": 4.2852018873908, + "tokens_seen": 227999744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701504513540622, + "loss": 3.5011, + "theoretical_loss": 4.285050904824925, + "tokens_seen": 228065280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047014042126379134, + "loss": 3.3456, + "theoretical_loss": 4.284899977782658, + "tokens_seen": 228130816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701303911735206, + "loss": 3.4321, + "theoretical_loss": 4.284749106227636, + "tokens_seen": 228196352 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047012036108324976, + "loss": 3.377, + "theoretical_loss": 4.284598290123535, + "tokens_seen": 228261888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047011033099297894, + "loss": 3.3713, + "theoretical_loss": 4.284447529434061, + "tokens_seen": 228327424 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004701003009027081, + "loss": 3.2215, + "theoretical_loss": 4.284296824122959, + "tokens_seen": 228392960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700902708124373, + "loss": 3.2989, + "theoretical_loss": 4.284146174154003, + "tokens_seen": 228458496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700802407221665, + "loss": 3.3728, + "theoretical_loss": 4.283995579491004, + "tokens_seen": 228524032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700702106318957, + "loss": 3.4452, + "theoretical_loss": 4.283845040097807, + "tokens_seen": 228589568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047006018054162484, + "loss": 3.2764, + "theoretical_loss": 4.28369455593829, + "tokens_seen": 228655104 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700501504513541, + "loss": 3.2551, + "theoretical_loss": 4.2835441269763646, + "tokens_seen": 228720640 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047004012036108326, + "loss": 3.3373, + "theoretical_loss": 4.283393753175979, + "tokens_seen": 228786176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047003009027081244, + "loss": 3.5512, + "theoretical_loss": 4.283243434501112, + "tokens_seen": 228851712 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700200601805416, + "loss": 3.2344, + "theoretical_loss": 4.283093170915778, + "tokens_seen": 228917248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004700100300902708, + "loss": 3.3867, + "theoretical_loss": 4.282942962384023, + "tokens_seen": 228982784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047, + "loss": 3.3632, + "theoretical_loss": 4.282792808869932, + "tokens_seen": 229048320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699899699097292, + "loss": 3.2307, + "theoretical_loss": 4.282642710337618, + "tokens_seen": 229113856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046997993981945835, + "loss": 3.3991, + "theoretical_loss": 4.28249266675123, + "tokens_seen": 229179392 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699699097291876, + "loss": 3.3958, + "theoretical_loss": 4.282342678074951, + "tokens_seen": 229244928 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046995987963891676, + "loss": 3.3667, + "theoretical_loss": 4.2821927442729955, + "tokens_seen": 229310464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 394727, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.46366810798645, + "objective/train/theoretical_loss": 4.282042865309616, + "objective/train/tokens_used": 249836000, + "theoretical_loss": 4.282042865309616, + "tokens_seen": 229376000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046994984954864595, + "loss": 3.3033, + "theoretical_loss": 4.282042865309616, + "tokens_seen": 229376000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699398194583752, + "loss": 3.333, + "theoretical_loss": 4.281893041149093, + "tokens_seen": 229441536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699297893681043, + "loss": 3.3091, + "theoretical_loss": 4.2817432717557455, + "tokens_seen": 229507072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046991975927783354, + "loss": 3.403, + "theoretical_loss": 4.28159355709392, + "tokens_seen": 229572608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046990972918756267, + "loss": 3.2928, + "theoretical_loss": 4.281443897128004, + "tokens_seen": 229638144 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698996990972919, + "loss": 3.4463, + "theoretical_loss": 4.2812942918224115, + "tokens_seen": 229703680 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698896690070211, + "loss": 3.2958, + "theoretical_loss": 4.281144741141593, + "tokens_seen": 229769216 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046987963891675027, + "loss": 3.2948, + "theoretical_loss": 4.280995245050032, + "tokens_seen": 229834752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046986960882647945, + "loss": 3.3462, + "theoretical_loss": 4.2808458035122445, + "tokens_seen": 229900288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698595787362087, + "loss": 3.375, + "theoretical_loss": 4.2806964164927805, + "tokens_seen": 229965824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698495486459378, + "loss": 3.4837, + "theoretical_loss": 4.280547083956224, + "tokens_seen": 230031360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046983951855566705, + "loss": 3.1836, + "theoretical_loss": 4.280397805867188, + "tokens_seen": 230096896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698294884653962, + "loss": 3.3435, + "theoretical_loss": 4.280248582190324, + "tokens_seen": 230162432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698194583751254, + "loss": 3.4223, + "theoretical_loss": 4.280099412890312, + "tokens_seen": 230227968 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004698094282848546, + "loss": 3.3549, + "theoretical_loss": 4.279950297931869, + "tokens_seen": 230293504 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046979939819458377, + "loss": 3.3682, + "theoretical_loss": 4.27980123727974, + "tokens_seen": 230359040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046978936810431295, + "loss": 3.3786, + "theoretical_loss": 4.279652230898709, + "tokens_seen": 230424576 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046977933801404213, + "loss": 3.3574, + "theoretical_loss": 4.279503278753586, + "tokens_seen": 230490112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697693079237713, + "loss": 3.3595, + "theoretical_loss": 4.27935438080922, + "tokens_seen": 230555648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046975927783350055, + "loss": 3.4349, + "theoretical_loss": 4.27920553703049, + "tokens_seen": 230621184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697492477432297, + "loss": 3.3751, + "theoretical_loss": 4.279056747382306, + "tokens_seen": 230686720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697392176529589, + "loss": 3.3152, + "theoretical_loss": 4.278908011829613, + "tokens_seen": 230752256 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046972918756268804, + "loss": 3.3812, + "theoretical_loss": 4.27875933033739, + "tokens_seen": 230817792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697191574724173, + "loss": 3.4473, + "theoretical_loss": 4.278610702870646, + "tokens_seen": 230883328 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046970912738214646, + "loss": 3.3841, + "theoretical_loss": 4.278462129394423, + "tokens_seen": 230948864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 397483, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6735587120056152, + "objective/train/theoretical_loss": 4.278313609873795, + "objective/train/tokens_used": 251474400, + "theoretical_loss": 4.278313609873795, + "tokens_seen": 231014400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046969909729187564, + "loss": 3.3431, + "theoretical_loss": 4.278313609873795, + "tokens_seen": 231014400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696890672016048, + "loss": 3.3619, + "theoretical_loss": 4.278165144273871, + "tokens_seen": 231079936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046967903711133405, + "loss": 3.4647, + "theoretical_loss": 4.27801673255979, + "tokens_seen": 231145472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696690070210632, + "loss": 3.1252, + "theoretical_loss": 4.277868374696725, + "tokens_seen": 231211008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696589769307924, + "loss": 3.3235, + "theoretical_loss": 4.277720070649879, + "tokens_seen": 231276544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046964894684052154, + "loss": 3.4981, + "theoretical_loss": 4.277571820384491, + "tokens_seen": 231342080 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696389167502508, + "loss": 3.4923, + "theoretical_loss": 4.277423623865829, + "tokens_seen": 231407616 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046962888665997996, + "loss": 3.4392, + "theoretical_loss": 4.277275481059195, + "tokens_seen": 231473152 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046961885656970914, + "loss": 3.3443, + "theoretical_loss": 4.2771273919299215, + "tokens_seen": 231538688 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696088264794383, + "loss": 3.3522, + "theoretical_loss": 4.276979356443377, + "tokens_seen": 231604224 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695987963891675, + "loss": 3.2476, + "theoretical_loss": 4.276831374564957, + "tokens_seen": 231669760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695887662988967, + "loss": 3.3512, + "theoretical_loss": 4.276683446260093, + "tokens_seen": 231735296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695787362086259, + "loss": 3.4507, + "theoretical_loss": 4.276535571494247, + "tokens_seen": 231800832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046956870611835505, + "loss": 3.3167, + "theoretical_loss": 4.276387750232913, + "tokens_seen": 231866368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695586760280843, + "loss": 3.3295, + "theoretical_loss": 4.276239982441617, + "tokens_seen": 231931904 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046954864593781346, + "loss": 3.4821, + "theoretical_loss": 4.276092268085918, + "tokens_seen": 231997440 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046953861584754264, + "loss": 3.3292, + "theoretical_loss": 4.275944607131406, + "tokens_seen": 232062976 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695285857572718, + "loss": 3.3861, + "theoretical_loss": 4.275796999543703, + "tokens_seen": 232128512 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469518555667001, + "loss": 3.3606, + "theoretical_loss": 4.275649445288461, + "tokens_seen": 232194048 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004695085255767302, + "loss": 3.3623, + "theoretical_loss": 4.275501944331367, + "tokens_seen": 232259584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694984954864594, + "loss": 3.3591, + "theoretical_loss": 4.275354496638139, + "tokens_seen": 232325120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046948846539618855, + "loss": 3.4444, + "theoretical_loss": 4.275207102174525, + "tokens_seen": 232390656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694784353059178, + "loss": 3.39, + "theoretical_loss": 4.275059760906305, + "tokens_seen": 232456192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694684052156469, + "loss": 3.1499, + "theoretical_loss": 4.2749124727992935, + "tokens_seen": 232521728 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046945837512537615, + "loss": 3.3641, + "theoretical_loss": 4.274765237819333, + "tokens_seen": 232587264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 399784, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4592337608337402, + "objective/train/theoretical_loss": 4.274618055932298, + "objective/train/tokens_used": 253112800, + "theoretical_loss": 4.274618055932298, + "tokens_seen": 232652800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694483450351053, + "loss": 3.5259, + "theoretical_loss": 4.274618055932298, + "tokens_seen": 232652800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694383149448345, + "loss": 3.2075, + "theoretical_loss": 4.2744709271040975, + "tokens_seen": 232718336 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004694282848545637, + "loss": 3.4248, + "theoretical_loss": 4.27432385130067, + "tokens_seen": 232783872 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046941825476429287, + "loss": 3.4602, + "theoretical_loss": 4.274176828487984, + "tokens_seen": 232849408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046940822467402205, + "loss": 3.5396, + "theoretical_loss": 4.2740298586320415, + "tokens_seen": 232914944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693981945837513, + "loss": 3.2887, + "theoretical_loss": 4.273882941698876, + "tokens_seen": 232980480 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693881644934804, + "loss": 3.3351, + "theoretical_loss": 4.27373607765455, + "tokens_seen": 233046016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046937813440320965, + "loss": 3.2714, + "theoretical_loss": 4.2735892664651605, + "tokens_seen": 233111552 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046936810431293883, + "loss": 3.4656, + "theoretical_loss": 4.273442508096833, + "tokens_seen": 233177088 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469358074222668, + "loss": 3.4278, + "theoretical_loss": 4.273295802515726, + "tokens_seen": 233242624 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693480441323972, + "loss": 3.3639, + "theoretical_loss": 4.273149149688028, + "tokens_seen": 233308160 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693380140421264, + "loss": 3.1441, + "theoretical_loss": 4.27300254957996, + "tokens_seen": 233373696 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046932798395185555, + "loss": 3.4209, + "theoretical_loss": 4.272856002157772, + "tokens_seen": 233439232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693179538615848, + "loss": 3.2964, + "theoretical_loss": 4.272709507387748, + "tokens_seen": 233504768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004693079237713139, + "loss": 3.3027, + "theoretical_loss": 4.2725630652362, + "tokens_seen": 233570304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046929789368104315, + "loss": 3.3066, + "theoretical_loss": 4.272416675669473, + "tokens_seen": 233635840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692878635907723, + "loss": 3.1362, + "theoretical_loss": 4.272270338653942, + "tokens_seen": 233701376 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692778335005015, + "loss": 3.4389, + "theoretical_loss": 4.272124054156014, + "tokens_seen": 233766912 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692678034102307, + "loss": 3.3281, + "theoretical_loss": 4.271977822142125, + "tokens_seen": 233832448 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692577733199599, + "loss": 3.4307, + "theoretical_loss": 4.271831642578745, + "tokens_seen": 233897984 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046924774322968906, + "loss": 3.5186, + "theoretical_loss": 4.27168551543237, + "tokens_seen": 233963520 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046923771313941824, + "loss": 3.3157, + "theoretical_loss": 4.271539440669532, + "tokens_seen": 234029056 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692276830491474, + "loss": 3.3622, + "theoretical_loss": 4.27139341825679, + "tokens_seen": 234094592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046921765295887666, + "loss": 3.3443, + "theoretical_loss": 4.271247448160736, + "tokens_seen": 234160128 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046920762286860584, + "loss": 3.2366, + "theoretical_loss": 4.27110153034799, + "tokens_seen": 234225664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 402735, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.518681049346924, + "objective/train/theoretical_loss": 4.270955664785207, + "objective/train/tokens_used": 254751200, + "theoretical_loss": 4.270955664785207, + "tokens_seen": 234291200 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469197592778335, + "loss": 3.3184, + "theoretical_loss": 4.270955664785207, + "tokens_seen": 234291200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046918756268806425, + "loss": 3.2765, + "theoretical_loss": 4.2708098514390676, + "tokens_seen": 234356736 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691775325977934, + "loss": 3.2732, + "theoretical_loss": 4.270664090276286, + "tokens_seen": 234422272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691675025075226, + "loss": 3.3131, + "theoretical_loss": 4.2705183812636065, + "tokens_seen": 234487808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046915747241725174, + "loss": 3.4021, + "theoretical_loss": 4.270372724367803, + "tokens_seen": 234553344 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469147442326981, + "loss": 3.3925, + "theoretical_loss": 4.270227119555681, + "tokens_seen": 234618880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046913741223671016, + "loss": 3.5184, + "theoretical_loss": 4.270081566794076, + "tokens_seen": 234684416 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046912738214643934, + "loss": 3.3248, + "theoretical_loss": 4.269936066049852, + "tokens_seen": 234749952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691173520561685, + "loss": 3.2262, + "theoretical_loss": 4.269790617289907, + "tokens_seen": 234815488 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004691073219658977, + "loss": 3.2708, + "theoretical_loss": 4.269645220481166, + "tokens_seen": 234881024 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690972918756269, + "loss": 3.1757, + "theoretical_loss": 4.269499875590587, + "tokens_seen": 234946560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690872617853561, + "loss": 3.2782, + "theoretical_loss": 4.269354582585156, + "tokens_seen": 235012096 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046907723169508525, + "loss": 3.4499, + "theoretical_loss": 4.269209341431889, + "tokens_seen": 235077632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690672016048145, + "loss": 3.403, + "theoretical_loss": 4.269064152097835, + "tokens_seen": 235143168 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046905717151454366, + "loss": 3.5655, + "theoretical_loss": 4.26891901455007, + "tokens_seen": 235208704 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046904714142427284, + "loss": 3.3014, + "theoretical_loss": 4.268773928755701, + "tokens_seen": 235274240 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469037111334002, + "loss": 3.2371, + "theoretical_loss": 4.268628894681868, + "tokens_seen": 235339776 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690270812437312, + "loss": 3.2151, + "theoretical_loss": 4.268483912295735, + "tokens_seen": 235405312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690170511534604, + "loss": 3.1058, + "theoretical_loss": 4.268338981564502, + "tokens_seen": 235470848 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004690070210631896, + "loss": 3.4541, + "theoretical_loss": 4.268194102455395, + "tokens_seen": 235536384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046899699097291875, + "loss": 3.1567, + "theoretical_loss": 4.26804927493567, + "tokens_seen": 235601920 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468986960882648, + "loss": 3.363, + "theoretical_loss": 4.267904498972618, + "tokens_seen": 235667456 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689769307923771, + "loss": 3.1686, + "theoretical_loss": 4.267759774533552, + "tokens_seen": 235732992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046896690070210635, + "loss": 3.3618, + "theoretical_loss": 4.267615101585821, + "tokens_seen": 235798528 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046895687061183553, + "loss": 3.417, + "theoretical_loss": 4.267470480096801, + "tokens_seen": 235864064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 405251, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.511091470718384, + "objective/train/theoretical_loss": 4.267325910033897, + "objective/train/tokens_used": 256389600, + "theoretical_loss": 4.267325910033897, + "tokens_seen": 235929600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689468405215647, + "loss": 3.2676, + "theoretical_loss": 4.267325910033897, + "tokens_seen": 235929600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689368104312939, + "loss": 3.2389, + "theoretical_loss": 4.267181391364547, + "tokens_seen": 235995136 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046892678034102307, + "loss": 3.3811, + "theoretical_loss": 4.267036924056215, + "tokens_seen": 236060672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046891675025075225, + "loss": 3.3345, + "theoretical_loss": 4.266892508076397, + "tokens_seen": 236126208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004689067201604815, + "loss": 3.3912, + "theoretical_loss": 4.266748143392617, + "tokens_seen": 236191744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688966900702106, + "loss": 3.2601, + "theoretical_loss": 4.26660382997243, + "tokens_seen": 236257280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046888665997993985, + "loss": 3.2775, + "theoretical_loss": 4.26645956778342, + "tokens_seen": 236322816 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046887662988966903, + "loss": 3.3896, + "theoretical_loss": 4.2663153567932, + "tokens_seen": 236388352 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688665997993982, + "loss": 3.3891, + "theoretical_loss": 4.266171196969412, + "tokens_seen": 236453888 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688565697091274, + "loss": 3.2042, + "theoretical_loss": 4.2660270882797295, + "tokens_seen": 236519424 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688465396188566, + "loss": 3.3827, + "theoretical_loss": 4.265883030691853, + "tokens_seen": 236584960 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046883650952858575, + "loss": 3.3702, + "theoretical_loss": 4.265739024173515, + "tokens_seen": 236650496 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468826479438315, + "loss": 3.4168, + "theoretical_loss": 4.265595068692473, + "tokens_seen": 236716032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004688164493480441, + "loss": 3.3531, + "theoretical_loss": 4.26545116421652, + "tokens_seen": 236781568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046880641925777335, + "loss": 3.3188, + "theoretical_loss": 4.265307310713471, + "tokens_seen": 236847104 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687963891675025, + "loss": 3.4284, + "theoretical_loss": 4.2651635081511765, + "tokens_seen": 236912640 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687863590772317, + "loss": 3.4436, + "theoretical_loss": 4.265019756497512, + "tokens_seen": 236978176 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687763289869609, + "loss": 3.2474, + "theoretical_loss": 4.264876055720386, + "tokens_seen": 237043712 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687662988966901, + "loss": 3.3793, + "theoretical_loss": 4.264732405787731, + "tokens_seen": 237109248 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046875626880641926, + "loss": 3.2444, + "theoretical_loss": 4.264588806667513, + "tokens_seen": 237174784 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046874623871614844, + "loss": 3.2761, + "theoretical_loss": 4.264445258327724, + "tokens_seen": 237240320 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687362086258776, + "loss": 3.22, + "theoretical_loss": 4.264301760736389, + "tokens_seen": 237305856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046872617853560686, + "loss": 3.3388, + "theoretical_loss": 4.264158313861557, + "tokens_seen": 237371392 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468716148445336, + "loss": 3.206, + "theoretical_loss": 4.264014917671309, + "tokens_seen": 237436928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004687061183550652, + "loss": 3.5133, + "theoretical_loss": 4.2638715721337554, + "tokens_seen": 237502464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 406700, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1778602600097656, + "objective/train/theoretical_loss": 4.263728277217032, + "objective/train/tokens_used": 258028000, + "theoretical_loss": 4.263728277217032, + "tokens_seen": 237568000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686960882647944, + "loss": 3.2286, + "theoretical_loss": 4.263728277217032, + "tokens_seen": 237568000 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686860581745236, + "loss": 3.3241, + "theoretical_loss": 4.263585032889306, + "tokens_seen": 237633536 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046867602808425276, + "loss": 3.2052, + "theoretical_loss": 4.263441839118776, + "tokens_seen": 237699072 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046866599799398194, + "loss": 3.2881, + "theoretical_loss": 4.2632986958736625, + "tokens_seen": 237764608 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686559679037111, + "loss": 3.3778, + "theoretical_loss": 4.263155603122221, + "tokens_seen": 237830144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046864593781344036, + "loss": 3.0997, + "theoretical_loss": 4.263012560832733, + "tokens_seen": 237895680 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686359077231695, + "loss": 3.224, + "theoretical_loss": 4.262869568973508, + "tokens_seen": 237961216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686258776328987, + "loss": 3.295, + "theoretical_loss": 4.262726627512886, + "tokens_seen": 238026752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046861584754262785, + "loss": 3.2811, + "theoretical_loss": 4.262583736419234, + "tokens_seen": 238092288 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686058174523571, + "loss": 3.2617, + "theoretical_loss": 4.26244089566095, + "tokens_seen": 238157824 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046859578736208626, + "loss": 3.3202, + "theoretical_loss": 4.262298105206456, + "tokens_seen": 238223360 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046858575727181545, + "loss": 3.1757, + "theoretical_loss": 4.262155365024207, + "tokens_seen": 238288896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685757271815446, + "loss": 3.219, + "theoretical_loss": 4.262012675082685, + "tokens_seen": 238354432 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685656970912738, + "loss": 3.3348, + "theoretical_loss": 4.261870035350399, + "tokens_seen": 238419968 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468555667001003, + "loss": 3.4419, + "theoretical_loss": 4.261727445795888, + "tokens_seen": 238485504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685456369107322, + "loss": 3.4575, + "theoretical_loss": 4.26158490638772, + "tokens_seen": 238551040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046853560682046135, + "loss": 3.4853, + "theoretical_loss": 4.261442417094488, + "tokens_seen": 238616576 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004685255767301906, + "loss": 3.364, + "theoretical_loss": 4.261299977884816, + "tokens_seen": 238682112 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046851554663991977, + "loss": 3.3649, + "theoretical_loss": 4.2611575887273565, + "tokens_seen": 238747648 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046850551654964895, + "loss": 3.3127, + "theoretical_loss": 4.261015249590789, + "tokens_seen": 238813184 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046849548645937813, + "loss": 3.4436, + "theoretical_loss": 4.260872960443822, + "tokens_seen": 238878720 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684854563691073, + "loss": 3.3953, + "theoretical_loss": 4.260730721255191, + "tokens_seen": 238944256 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684754262788365, + "loss": 3.328, + "theoretical_loss": 4.260588531993662, + "tokens_seen": 239009792 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046846539618856573, + "loss": 3.237, + "theoretical_loss": 4.260446392628026, + "tokens_seen": 239075328 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684553660982949, + "loss": 3.3323, + "theoretical_loss": 4.2603043031271035, + "tokens_seen": 239140864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 409530, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4040510654449463, + "objective/train/theoretical_loss": 4.260162263459744, + "objective/train/tokens_used": 259666400, + "theoretical_loss": 4.260162263459744, + "tokens_seen": 239206400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684453360080241, + "loss": 3.541, + "theoretical_loss": 4.260162263459744, + "tokens_seen": 239206400 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046843530591775327, + "loss": 3.4665, + "theoretical_loss": 4.260020273594824, + "tokens_seen": 239271936 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046842527582748245, + "loss": 3.4293, + "theoretical_loss": 4.259878333501247, + "tokens_seen": 239337472 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684152457372117, + "loss": 3.2033, + "theoretical_loss": 4.259736443147946, + "tokens_seen": 239403008 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004684052156469408, + "loss": 3.2626, + "theoretical_loss": 4.259594602503881, + "tokens_seen": 239468544 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046839518555667005, + "loss": 3.3929, + "theoretical_loss": 4.259452811538041, + "tokens_seen": 239534080 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046838515546639923, + "loss": 3.1486, + "theoretical_loss": 4.259311070219441, + "tokens_seen": 239599616 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683751253761284, + "loss": 3.1868, + "theoretical_loss": 4.259169378517125, + "tokens_seen": 239665152 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683650952858576, + "loss": 3.4297, + "theoretical_loss": 4.259027736400165, + "tokens_seen": 239730688 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683550651955868, + "loss": 3.3967, + "theoretical_loss": 4.258886143837661, + "tokens_seen": 239796224 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046834503510531595, + "loss": 3.2226, + "theoretical_loss": 4.258744600798739, + "tokens_seen": 239861760 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683350050150452, + "loss": 3.0907, + "theoretical_loss": 4.2586031072525525, + "tokens_seen": 239927296 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683249749247743, + "loss": 3.3473, + "theoretical_loss": 4.258461663168285, + "tokens_seen": 239992832 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046831494483450355, + "loss": 3.2662, + "theoretical_loss": 4.258320268515147, + "tokens_seen": 240058368 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683049147442327, + "loss": 3.3587, + "theoretical_loss": 4.258178923262376, + "tokens_seen": 240123904 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682948846539619, + "loss": 3.4883, + "theoretical_loss": 4.258037627379235, + "tokens_seen": 240189440 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682848545636911, + "loss": 3.4508, + "theoretical_loss": 4.257896380835018, + "tokens_seen": 240254976 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682748244734203, + "loss": 3.2885, + "theoretical_loss": 4.257755183599045, + "tokens_seen": 240320512 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046826479438314946, + "loss": 3.2756, + "theoretical_loss": 4.257614035640662, + "tokens_seen": 240386048 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046825476429287864, + "loss": 3.4311, + "theoretical_loss": 4.257472936929246, + "tokens_seen": 240451584 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682447342026078, + "loss": 3.2328, + "theoretical_loss": 4.257331887434198, + "tokens_seen": 240517120 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046823470411233706, + "loss": 3.3426, + "theoretical_loss": 4.257190887124946, + "tokens_seen": 240582656 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682246740220662, + "loss": 3.3918, + "theoretical_loss": 4.25704993597095, + "tokens_seen": 240648192 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682146439317954, + "loss": 3.2312, + "theoretical_loss": 4.256909033941691, + "tokens_seen": 240713728 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682046138415246, + "loss": 3.3552, + "theoretical_loss": 4.256768181006683, + "tokens_seen": 240779264 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 412261, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1952106952667236, + "objective/train/theoretical_loss": 4.2566273771354615, + "objective/train/tokens_used": 261304800, + "theoretical_loss": 4.2566273771354615, + "tokens_seen": 240844800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681945837512538, + "loss": 3.2257, + "theoretical_loss": 4.2566273771354615, + "tokens_seen": 240844800 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046818455366098296, + "loss": 3.4086, + "theoretical_loss": 4.256486622297595, + "tokens_seen": 240910336 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046817452357071214, + "loss": 3.4526, + "theoretical_loss": 4.256345916462674, + "tokens_seen": 240975872 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681644934804413, + "loss": 3.1397, + "theoretical_loss": 4.256205259600321, + "tokens_seen": 241041408 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046815446339017056, + "loss": 3.3902, + "theoretical_loss": 4.256064651680182, + "tokens_seen": 241106944 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681444332998997, + "loss": 3.2706, + "theoretical_loss": 4.255924092671931, + "tokens_seen": 241172480 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681344032096289, + "loss": 3.2616, + "theoretical_loss": 4.255783582545269, + "tokens_seen": 241238016 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046812437311935805, + "loss": 3.3759, + "theoretical_loss": 4.255643121269924, + "tokens_seen": 241303552 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681143430290873, + "loss": 3.5205, + "theoretical_loss": 4.255502708815651, + "tokens_seen": 241369088 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046810431293881646, + "loss": 3.3384, + "theoretical_loss": 4.255362345152234, + "tokens_seen": 241434624 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046809428284854565, + "loss": 3.1643, + "theoretical_loss": 4.255222030249479, + "tokens_seen": 241500160 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680842527582748, + "loss": 3.3843, + "theoretical_loss": 4.255081764077224, + "tokens_seen": 241565696 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468074222668004, + "loss": 3.3572, + "theoretical_loss": 4.25494154660533, + "tokens_seen": 241631232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680641925777332, + "loss": 3.2495, + "theoretical_loss": 4.254801377803689, + "tokens_seen": 241696768 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680541624874624, + "loss": 3.3032, + "theoretical_loss": 4.254661257642215, + "tokens_seen": 241762304 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046804413239719155, + "loss": 3.3338, + "theoretical_loss": 4.254521186090852, + "tokens_seen": 241827840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004680341023069208, + "loss": 3.2753, + "theoretical_loss": 4.254381163119568, + "tokens_seen": 241893376 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046802407221664997, + "loss": 3.2282, + "theoretical_loss": 4.254241188698361, + "tokens_seen": 241958912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046801404212637915, + "loss": 3.2665, + "theoretical_loss": 4.2541012627972545, + "tokens_seen": 242024448 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046800401203610833, + "loss": 3.1205, + "theoretical_loss": 4.2539613853862965, + "tokens_seen": 242089984 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679939819458375, + "loss": 3.4145, + "theoretical_loss": 4.253821556435565, + "tokens_seen": 242155520 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679839518555667, + "loss": 3.3318, + "theoretical_loss": 4.253681775915161, + "tokens_seen": 242221056 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046797392176529593, + "loss": 3.2545, + "theoretical_loss": 4.253542043795215, + "tokens_seen": 242286592 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046796389167502505, + "loss": 3.1722, + "theoretical_loss": 4.253402360045882, + "tokens_seen": 242352128 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679538615847543, + "loss": 3.2968, + "theoretical_loss": 4.253262724637346, + "tokens_seen": 242417664 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 414980, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.376100778579712, + "objective/train/theoretical_loss": 4.253123137539814, + "objective/train/tokens_used": 262943200, + "theoretical_loss": 4.253123137539814, + "tokens_seen": 242483200 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679438314944834, + "loss": 3.322, + "theoretical_loss": 4.253123137539814, + "tokens_seen": 242483200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046793380140421265, + "loss": 3.4644, + "theoretical_loss": 4.252983598723521, + "tokens_seen": 242548736 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046792377131394183, + "loss": 3.3526, + "theoretical_loss": 4.25284410815873, + "tokens_seen": 242614272 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467913741223671, + "loss": 3.4176, + "theoretical_loss": 4.2527046658157275, + "tokens_seen": 242679808 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679037111334002, + "loss": 3.2747, + "theoretical_loss": 4.252565271664828, + "tokens_seen": 242745344 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046789368104312943, + "loss": 2.9976, + "theoretical_loss": 4.252425925676373, + "tokens_seen": 242810880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046788365095285856, + "loss": 3.5564, + "theoretical_loss": 4.252286627820727, + "tokens_seen": 242876416 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678736208625878, + "loss": 3.0342, + "theoretical_loss": 4.252147378068285, + "tokens_seen": 242941952 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678635907723169, + "loss": 3.2689, + "theoretical_loss": 4.252008176389465, + "tokens_seen": 243007488 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046785356068204616, + "loss": 3.2795, + "theoretical_loss": 4.251869022754712, + "tokens_seen": 243073024 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046784353059177534, + "loss": 3.3239, + "theoretical_loss": 4.251729917134498, + "tokens_seen": 243138560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678335005015045, + "loss": 3.3601, + "theoretical_loss": 4.251590859499322, + "tokens_seen": 243204096 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678234704112337, + "loss": 3.4365, + "theoretical_loss": 4.251451849819704, + "tokens_seen": 243269632 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004678134403209629, + "loss": 3.361, + "theoretical_loss": 4.251312888066197, + "tokens_seen": 243335168 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046780341023069206, + "loss": 3.243, + "theoretical_loss": 4.251173974209375, + "tokens_seen": 243400704 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677933801404213, + "loss": 3.1348, + "theoretical_loss": 4.251035108219839, + "tokens_seen": 243466240 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677833500501504, + "loss": 3.3948, + "theoretical_loss": 4.250896290068218, + "tokens_seen": 243531776 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046777331995987966, + "loss": 3.3517, + "theoretical_loss": 4.250757519725165, + "tokens_seen": 243597312 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677632898696088, + "loss": 3.2284, + "theoretical_loss": 4.25061879716136, + "tokens_seen": 243662848 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467753259779338, + "loss": 3.3484, + "theoretical_loss": 4.250480122347507, + "tokens_seen": 243728384 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677432296890672, + "loss": 3.2918, + "theoretical_loss": 4.250341495254337, + "tokens_seen": 243793920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677331995987964, + "loss": 3.3152, + "theoretical_loss": 4.250202915852608, + "tokens_seen": 243859456 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046772316950852556, + "loss": 3.2372, + "theoretical_loss": 4.250064384113102, + "tokens_seen": 243924992 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677131394182548, + "loss": 3.4017, + "theoretical_loss": 4.249925900006627, + "tokens_seen": 243990528 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467703109327984, + "loss": 3.2153, + "theoretical_loss": 4.249787463504019, + "tokens_seen": 244056064 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 418010, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.642868995666504, + "objective/train/theoretical_loss": 4.249649074576134, + "objective/train/tokens_used": 264581600, + "theoretical_loss": 4.249649074576134, + "tokens_seen": 244121600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046769307923771316, + "loss": 3.4846, + "theoretical_loss": 4.249649074576134, + "tokens_seen": 244121600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046768304914744234, + "loss": 3.3416, + "theoretical_loss": 4.249510733193862, + "tokens_seen": 244187136 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676730190571715, + "loss": 3.1832, + "theoretical_loss": 4.249372439328111, + "tokens_seen": 244252672 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046766298896690076, + "loss": 3.2903, + "theoretical_loss": 4.249234192949818, + "tokens_seen": 244318208 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676529588766299, + "loss": 3.4449, + "theoretical_loss": 4.249095994029947, + "tokens_seen": 244383744 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676429287863591, + "loss": 3.4345, + "theoretical_loss": 4.248957842539484, + "tokens_seen": 244449280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046763289869608825, + "loss": 3.2594, + "theoretical_loss": 4.248819738449442, + "tokens_seen": 244514816 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004676228686058175, + "loss": 3.1658, + "theoretical_loss": 4.2486816817308615, + "tokens_seen": 244580352 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046761283851554666, + "loss": 3.3979, + "theoretical_loss": 4.248543672354805, + "tokens_seen": 244645888 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046760280842527585, + "loss": 3.488, + "theoretical_loss": 4.248405710292364, + "tokens_seen": 244711424 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467592778335005, + "loss": 3.433, + "theoretical_loss": 4.248267795514652, + "tokens_seen": 244776960 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675827482447342, + "loss": 3.326, + "theoretical_loss": 4.248129927992808, + "tokens_seen": 244842496 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675727181544634, + "loss": 3.3601, + "theoretical_loss": 4.247992107698002, + "tokens_seen": 244908032 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675626880641926, + "loss": 3.2576, + "theoretical_loss": 4.247854334601421, + "tokens_seen": 244973568 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046755265797392175, + "loss": 3.3217, + "theoretical_loss": 4.247716608674283, + "tokens_seen": 245039104 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467542627883651, + "loss": 3.2677, + "theoretical_loss": 4.247578929887829, + "tokens_seen": 245104640 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046753259779338017, + "loss": 3.378, + "theoretical_loss": 4.247441298213326, + "tokens_seen": 245170176 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046752256770310935, + "loss": 3.2138, + "theoretical_loss": 4.247303713622067, + "tokens_seen": 245235712 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046751253761283853, + "loss": 3.1309, + "theoretical_loss": 4.247166176085367, + "tokens_seen": 245301248 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004675025075225677, + "loss": 3.2846, + "theoretical_loss": 4.247028685574569, + "tokens_seen": 245366784 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674924774322969, + "loss": 3.1897, + "theoretical_loss": 4.246891242061041, + "tokens_seen": 245432320 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046748244734202613, + "loss": 3.4708, + "theoretical_loss": 4.246753845516174, + "tokens_seen": 245497856 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046747241725175525, + "loss": 3.3002, + "theoretical_loss": 4.246616495911388, + "tokens_seen": 245563392 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674623871614845, + "loss": 3.1398, + "theoretical_loss": 4.246479193218123, + "tokens_seen": 245628928 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674523570712136, + "loss": 3.2185, + "theoretical_loss": 4.246341937407848, + "tokens_seen": 245694464 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 420862, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.6693809032440186, + "objective/train/theoretical_loss": 4.246204728452055, + "objective/train/tokens_used": 266220000, + "theoretical_loss": 4.246204728452055, + "tokens_seen": 245760000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046744232698094285, + "loss": 3.4755, + "theoretical_loss": 4.246204728452055, + "tokens_seen": 245760000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046743229689067203, + "loss": 3.0774, + "theoretical_loss": 4.246067566322259, + "tokens_seen": 245825536 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674222668004012, + "loss": 3.2301, + "theoretical_loss": 4.245930450990007, + "tokens_seen": 245891072 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004674122367101304, + "loss": 3.1468, + "theoretical_loss": 4.245793382426861, + "tokens_seen": 245956608 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046740220661985963, + "loss": 3.2237, + "theoretical_loss": 4.245656360604417, + "tokens_seen": 246022144 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046739217652958876, + "loss": 3.2605, + "theoretical_loss": 4.24551938549429, + "tokens_seen": 246087680 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467382146439318, + "loss": 3.1747, + "theoretical_loss": 4.2453824570681205, + "tokens_seen": 246153216 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673721163490471, + "loss": 3.2787, + "theoretical_loss": 4.245245575297577, + "tokens_seen": 246218752 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046736208625877636, + "loss": 3.1383, + "theoretical_loss": 4.2451087401543495, + "tokens_seen": 246284288 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046735205616850554, + "loss": 3.3935, + "theoretical_loss": 4.244971951610154, + "tokens_seen": 246349824 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673420260782347, + "loss": 3.3626, + "theoretical_loss": 4.24483520963673, + "tokens_seen": 246415360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673319959879639, + "loss": 3.2432, + "theoretical_loss": 4.244698514205844, + "tokens_seen": 246480896 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673219658976931, + "loss": 2.9633, + "theoretical_loss": 4.244561865289285, + "tokens_seen": 246546432 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046731193580742226, + "loss": 3.3363, + "theoretical_loss": 4.244425262858867, + "tokens_seen": 246611968 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673019057171515, + "loss": 3.3936, + "theoretical_loss": 4.2442887068864295, + "tokens_seen": 246677504 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672918756268806, + "loss": 3.3798, + "theoretical_loss": 4.244152197343835, + "tokens_seen": 246743040 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046728184553660986, + "loss": 3.3853, + "theoretical_loss": 4.244015734202973, + "tokens_seen": 246808576 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467271815446339, + "loss": 3.1934, + "theoretical_loss": 4.243879317435755, + "tokens_seen": 246874112 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672617853560682, + "loss": 3.2353, + "theoretical_loss": 4.243742947014117, + "tokens_seen": 246939648 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672517552657974, + "loss": 3.2925, + "theoretical_loss": 4.243606622910021, + "tokens_seen": 247005184 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672417251755266, + "loss": 3.1572, + "theoretical_loss": 4.243470345095453, + "tokens_seen": 247070720 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046723169508525576, + "loss": 3.0776, + "theoretical_loss": 4.2433341135424225, + "tokens_seen": 247136256 + }, + { + "epoch": 0.07, + "learning_rate": 0.000467221664994985, + "loss": 3.4207, + "theoretical_loss": 4.243197928222964, + "tokens_seen": 247201792 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672116349047141, + "loss": 3.3876, + "theoretical_loss": 4.243061789109136, + "tokens_seen": 247267328 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046720160481444336, + "loss": 3.1718, + "theoretical_loss": 4.242925696173021, + "tokens_seen": 247332864 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 422249, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.614682674407959, + "objective/train/theoretical_loss": 4.2427896493867285, + "objective/train/tokens_used": 267858400, + "theoretical_loss": 4.2427896493867285, + "tokens_seen": 247398400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004671915747241725, + "loss": 3.3472, + "theoretical_loss": 4.2427896493867285, + "tokens_seen": 247398400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004671815446339017, + "loss": 3.2511, + "theoretical_loss": 4.242653648722387, + "tokens_seen": 247463936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671715145436309, + "loss": 3.2284, + "theoretical_loss": 4.242517694152154, + "tokens_seen": 247529472 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671614844533601, + "loss": 3.1887, + "theoretical_loss": 4.24238178564821, + "tokens_seen": 247595008 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046715145436308927, + "loss": 3.2476, + "theoretical_loss": 4.242245923182756, + "tokens_seen": 247660544 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046714142427281845, + "loss": 3.3871, + "theoretical_loss": 4.242110106728022, + "tokens_seen": 247726080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046713139418254763, + "loss": 3.2798, + "theoretical_loss": 4.241974336256261, + "tokens_seen": 247791616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046712136409227686, + "loss": 2.9492, + "theoretical_loss": 4.241838611739748, + "tokens_seen": 247857152 + }, + { + "epoch": 0.08, + "learning_rate": 0.000467111334002006, + "loss": 3.3364, + "theoretical_loss": 4.241702933150783, + "tokens_seen": 247922688 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004671013039117352, + "loss": 3.5392, + "theoretical_loss": 4.241567300461693, + "tokens_seen": 247988224 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046709127382146435, + "loss": 3.2121, + "theoretical_loss": 4.241431713644823, + "tokens_seen": 248053760 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670812437311936, + "loss": 3.1965, + "theoretical_loss": 4.241296172672547, + "tokens_seen": 248119296 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046707121364092277, + "loss": 3.3442, + "theoretical_loss": 4.24116067751726, + "tokens_seen": 248184832 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046706118355065195, + "loss": 3.3514, + "theoretical_loss": 4.241025228151383, + "tokens_seen": 248250368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046705115346038113, + "loss": 3.3263, + "theoretical_loss": 4.24088982454736, + "tokens_seen": 248315904 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046704112337011037, + "loss": 3.3149, + "theoretical_loss": 4.240754466677659, + "tokens_seen": 248381440 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670310932798395, + "loss": 3.2626, + "theoretical_loss": 4.240619154514771, + "tokens_seen": 248446976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046702106318956873, + "loss": 3.246, + "theoretical_loss": 4.240483888031212, + "tokens_seen": 248512512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046701103309929786, + "loss": 3.2991, + "theoretical_loss": 4.240348667199521, + "tokens_seen": 248578048 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004670010030090271, + "loss": 3.3613, + "theoretical_loss": 4.240213491992261, + "tokens_seen": 248643584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669909729187563, + "loss": 3.3745, + "theoretical_loss": 4.240078362382019, + "tokens_seen": 248709120 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046698094282848545, + "loss": 3.3737, + "theoretical_loss": 4.239943278341404, + "tokens_seen": 248774656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046697091273821464, + "loss": 3.2759, + "theoretical_loss": 4.239808239843052, + "tokens_seen": 248840192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669608826479438, + "loss": 3.1217, + "theoretical_loss": 4.239673246859619, + "tokens_seen": 248905728 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046695085255767305, + "loss": 3.1495, + "theoretical_loss": 4.239538299363788, + "tokens_seen": 248971264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 425137, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2362241744995117, + "objective/train/theoretical_loss": 4.239403397328261, + "objective/train/tokens_used": 269496800, + "theoretical_loss": 4.239403397328261, + "tokens_seen": 249036800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046694082246740223, + "loss": 3.3189, + "theoretical_loss": 4.239403397328261, + "tokens_seen": 249036800 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669307923771314, + "loss": 3.3616, + "theoretical_loss": 4.239268540725769, + "tokens_seen": 249102336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669207622868606, + "loss": 3.3148, + "theoretical_loss": 4.239133729529064, + "tokens_seen": 249167872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046691073219658983, + "loss": 3.4464, + "theoretical_loss": 4.2389989637109196, + "tokens_seen": 249233408 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046690070210631896, + "loss": 3.1552, + "theoretical_loss": 4.2388642432441355, + "tokens_seen": 249298944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668906720160482, + "loss": 3.3609, + "theoretical_loss": 4.238729568101535, + "tokens_seen": 249364480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668806419257773, + "loss": 3.2651, + "theoretical_loss": 4.238594938255963, + "tokens_seen": 249430016 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046687061183550656, + "loss": 3.3861, + "theoretical_loss": 4.2384603536802885, + "tokens_seen": 249495552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046686058174523574, + "loss": 3.3166, + "theoretical_loss": 4.238325814347404, + "tokens_seen": 249561088 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668505516549649, + "loss": 3.2519, + "theoretical_loss": 4.238191320230227, + "tokens_seen": 249626624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668405215646941, + "loss": 3.2741, + "theoretical_loss": 4.238056871301695, + "tokens_seen": 249692160 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668304914744233, + "loss": 3.1328, + "theoretical_loss": 4.237922467534771, + "tokens_seen": 249757696 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046682046138415246, + "loss": 3.2581, + "theoretical_loss": 4.237788108902441, + "tokens_seen": 249823232 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668104312938817, + "loss": 3.2584, + "theoretical_loss": 4.237653795377714, + "tokens_seen": 249888768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668004012036108, + "loss": 3.4916, + "theoretical_loss": 4.237519526933622, + "tokens_seen": 249954304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046679037111334006, + "loss": 3.2192, + "theoretical_loss": 4.2373853035432205, + "tokens_seen": 250019840 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667803410230692, + "loss": 3.2908, + "theoretical_loss": 4.237251125179588, + "tokens_seen": 250085376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667703109327984, + "loss": 3.3605, + "theoretical_loss": 4.237116991815826, + "tokens_seen": 250150912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667602808425276, + "loss": 3.3276, + "theoretical_loss": 4.23698290342506, + "tokens_seen": 250216448 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667502507522568, + "loss": 3.3852, + "theoretical_loss": 4.236848859980437, + "tokens_seen": 250281984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046674022066198596, + "loss": 3.3049, + "theoretical_loss": 4.23671486145513, + "tokens_seen": 250347520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667301905717152, + "loss": 3.2814, + "theoretical_loss": 4.236580907822331, + "tokens_seen": 250413056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667201604814443, + "loss": 3.1424, + "theoretical_loss": 4.236446999055257, + "tokens_seen": 250478592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046671013039117356, + "loss": 3.3069, + "theoretical_loss": 4.2363131351271495, + "tokens_seen": 250544128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667001003009027, + "loss": 3.3523, + "theoretical_loss": 4.2361793160112695, + "tokens_seen": 250609664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 428605, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.342467784881592, + "objective/train/theoretical_loss": 4.236045541680905, + "objective/train/tokens_used": 271135200, + "theoretical_loss": 4.236045541680905, + "tokens_seen": 250675200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666900702106319, + "loss": 3.0975, + "theoretical_loss": 4.236045541680905, + "tokens_seen": 250675200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666800401203611, + "loss": 3.2809, + "theoretical_loss": 4.235911812109363, + "tokens_seen": 250740736 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666700100300903, + "loss": 3.3463, + "theoretical_loss": 4.235778127269976, + "tokens_seen": 250806272 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046665997993981947, + "loss": 3.1909, + "theoretical_loss": 4.235644487136098, + "tokens_seen": 250871808 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046664994984954865, + "loss": 3.2503, + "theoretical_loss": 4.235510891681108, + "tokens_seen": 250937344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046663991975927783, + "loss": 3.3007, + "theoretical_loss": 4.235377340878404, + "tokens_seen": 251002880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046662988966900706, + "loss": 3.2182, + "theoretical_loss": 4.23524383470141, + "tokens_seen": 251068416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004666198595787362, + "loss": 3.1153, + "theoretical_loss": 4.235110373123572, + "tokens_seen": 251133952 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046660982948846543, + "loss": 3.27, + "theoretical_loss": 4.2349769561183574, + "tokens_seen": 251199488 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046659979939819455, + "loss": 3.2842, + "theoretical_loss": 4.2348435836592575, + "tokens_seen": 251265024 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665897693079238, + "loss": 3.4081, + "theoretical_loss": 4.234710255719786, + "tokens_seen": 251330560 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046657973921765297, + "loss": 3.33, + "theoretical_loss": 4.234576972273481, + "tokens_seen": 251396096 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046656970912738215, + "loss": 3.4677, + "theoretical_loss": 4.234443733293899, + "tokens_seen": 251461632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046655967903711133, + "loss": 3.3585, + "theoretical_loss": 4.234310538754624, + "tokens_seen": 251527168 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046654964894684057, + "loss": 3.4218, + "theoretical_loss": 4.2341773886292575, + "tokens_seen": 251592704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665396188565697, + "loss": 3.4887, + "theoretical_loss": 4.234044282891429, + "tokens_seen": 251658240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046652958876629893, + "loss": 3.3558, + "theoretical_loss": 4.233911221514787, + "tokens_seen": 251723776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046651955867602806, + "loss": 3.3855, + "theoretical_loss": 4.233778204473002, + "tokens_seen": 251789312 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665095285857573, + "loss": 3.3142, + "theoretical_loss": 4.23364523173977, + "tokens_seen": 251854848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664994984954865, + "loss": 3.4689, + "theoretical_loss": 4.233512303288807, + "tokens_seen": 251920384 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046648946840521565, + "loss": 3.3657, + "theoretical_loss": 4.233379419093851, + "tokens_seen": 251985920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046647943831494484, + "loss": 3.2658, + "theoretical_loss": 4.233246579128666, + "tokens_seen": 252051456 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466469408224674, + "loss": 3.1877, + "theoretical_loss": 4.233113783367033, + "tokens_seen": 252116992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664593781344032, + "loss": 3.344, + "theoretical_loss": 4.232981031782761, + "tokens_seen": 252182528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046644934804413243, + "loss": 3.3379, + "theoretical_loss": 4.232848324349677, + "tokens_seen": 252248064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 430057, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5110082626342773, + "objective/train/theoretical_loss": 4.232715661041632, + "objective/train/tokens_used": 272773600, + "theoretical_loss": 4.232715661041632, + "tokens_seen": 252313600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046643931795386156, + "loss": 3.2061, + "theoretical_loss": 4.232715661041632, + "tokens_seen": 252313600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664292878635908, + "loss": 3.0991, + "theoretical_loss": 4.232583041832499, + "tokens_seen": 252379136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664192577733199, + "loss": 3.3572, + "theoretical_loss": 4.232450466696174, + "tokens_seen": 252444672 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046640922768304916, + "loss": 3.1959, + "theoretical_loss": 4.2323179356065745, + "tokens_seen": 252510208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046639919759277834, + "loss": 3.233, + "theoretical_loss": 4.23218544853764, + "tokens_seen": 252575744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663891675025075, + "loss": 3.274, + "theoretical_loss": 4.232053005463333, + "tokens_seen": 252641280 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663791374122367, + "loss": 3.2249, + "theoretical_loss": 4.231920606357638, + "tokens_seen": 252706816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046636910732196594, + "loss": 3.4196, + "theoretical_loss": 4.231788251194559, + "tokens_seen": 252772352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046635907723169506, + "loss": 3.2772, + "theoretical_loss": 4.231655939948127, + "tokens_seen": 252837888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663490471414243, + "loss": 3.0901, + "theoretical_loss": 4.231523672592392, + "tokens_seen": 252903424 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663390170511534, + "loss": 3.3559, + "theoretical_loss": 4.231391449101425, + "tokens_seen": 252968960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046632898696088266, + "loss": 3.4577, + "theoretical_loss": 4.231259269449322, + "tokens_seen": 253034496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046631895687061184, + "loss": 3.3075, + "theoretical_loss": 4.231127133610198, + "tokens_seen": 253100032 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466308926780341, + "loss": 3.1986, + "theoretical_loss": 4.230995041558194, + "tokens_seen": 253165568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662988966900702, + "loss": 3.3125, + "theoretical_loss": 4.230862993267468, + "tokens_seen": 253231104 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662888665997994, + "loss": 3.2639, + "theoretical_loss": 4.230730988712205, + "tokens_seen": 253296640 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046627883650952857, + "loss": 3.4571, + "theoretical_loss": 4.230599027866606, + "tokens_seen": 253362176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662688064192578, + "loss": 3.2781, + "theoretical_loss": 4.2304671107048994, + "tokens_seen": 253427712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046625877632898693, + "loss": 3.1914, + "theoretical_loss": 4.2303352372013325, + "tokens_seen": 253493248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046624874623871616, + "loss": 3.3246, + "theoretical_loss": 4.230203407330176, + "tokens_seen": 253558784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046623871614844535, + "loss": 3.3088, + "theoretical_loss": 4.230071621065721, + "tokens_seen": 253624320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662286860581745, + "loss": 3.3185, + "theoretical_loss": 4.2299398783822815, + "tokens_seen": 253689856 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662186559679037, + "loss": 3.1814, + "theoretical_loss": 4.229808179254192, + "tokens_seen": 253755392 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004662086258776329, + "loss": 3.2235, + "theoretical_loss": 4.22967652365581, + "tokens_seen": 253820928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661985957873621, + "loss": 3.3586, + "theoretical_loss": 4.229544911561513, + "tokens_seen": 253886464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 433011, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.399458646774292, + "objective/train/theoretical_loss": 4.229413342945703, + "objective/train/tokens_used": 274412000, + "theoretical_loss": 4.229413342945703, + "tokens_seen": 253952000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661885656970913, + "loss": 3.262, + "theoretical_loss": 4.229413342945703, + "tokens_seen": 253952000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661785356068205, + "loss": 3.2041, + "theoretical_loss": 4.229281817782801, + "tokens_seen": 254017536 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046616850551654967, + "loss": 3.2659, + "theoretical_loss": 4.229150336047251, + "tokens_seen": 254083072 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046615847542627885, + "loss": 3.2659, + "theoretical_loss": 4.229018897713519, + "tokens_seen": 254148608 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046614844533600803, + "loss": 3.2829, + "theoretical_loss": 4.22888750275609, + "tokens_seen": 254214144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046613841524573727, + "loss": 3.311, + "theoretical_loss": 4.228756151149475, + "tokens_seen": 254279680 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004661283851554664, + "loss": 3.3988, + "theoretical_loss": 4.228624842868202, + "tokens_seen": 254345216 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046611835506519563, + "loss": 3.3105, + "theoretical_loss": 4.228493577886824, + "tokens_seen": 254410752 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046610832497492475, + "loss": 3.3131, + "theoretical_loss": 4.228362356179913, + "tokens_seen": 254476288 + }, + { + "epoch": 0.08, + "learning_rate": 0.000466098294884654, + "loss": 3.057, + "theoretical_loss": 4.228231177722063, + "tokens_seen": 254541824 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046608826479438317, + "loss": 3.1909, + "theoretical_loss": 4.228100042487892, + "tokens_seen": 254607360 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046607823470411235, + "loss": 3.2962, + "theoretical_loss": 4.227968950452035, + "tokens_seen": 254672896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046606820461384153, + "loss": 3.3027, + "theoretical_loss": 4.227837901589153, + "tokens_seen": 254738432 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046605817452357077, + "loss": 3.3067, + "theoretical_loss": 4.227706895873924, + "tokens_seen": 254803968 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660481444332999, + "loss": 3.3149, + "theoretical_loss": 4.227575933281051, + "tokens_seen": 254869504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046603811434302913, + "loss": 3.1916, + "theoretical_loss": 4.227445013785257, + "tokens_seen": 254935040 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046602808425275826, + "loss": 3.3126, + "theoretical_loss": 4.227314137361285, + "tokens_seen": 255000576 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660180541624875, + "loss": 3.3594, + "theoretical_loss": 4.227183303983901, + "tokens_seen": 255066112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004660080240722167, + "loss": 3.4086, + "theoretical_loss": 4.227052513627893, + "tokens_seen": 255131648 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046599799398194586, + "loss": 3.3451, + "theoretical_loss": 4.226921766268067, + "tokens_seen": 255197184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046598796389167504, + "loss": 3.4092, + "theoretical_loss": 4.226791061879253, + "tokens_seen": 255262720 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659779338014042, + "loss": 3.3499, + "theoretical_loss": 4.226660400436302, + "tokens_seen": 255328256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659679037111334, + "loss": 3.287, + "theoretical_loss": 4.226529781914084, + "tokens_seen": 255393792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046595787362086263, + "loss": 3.5984, + "theoretical_loss": 4.226399206287493, + "tokens_seen": 255459328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046594784353059176, + "loss": 3.2001, + "theoretical_loss": 4.226268673531442, + "tokens_seen": 255524864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 435787, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9447875022888184, + "objective/train/theoretical_loss": 4.226138183620867, + "objective/train/tokens_used": 276050400, + "theoretical_loss": 4.226138183620867, + "tokens_seen": 255590400 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465937813440321, + "loss": 3.2321, + "theoretical_loss": 4.226138183620867, + "tokens_seen": 255590400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659277833500501, + "loss": 3.2701, + "theoretical_loss": 4.226007736530723, + "tokens_seen": 255655936 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046591775325977936, + "loss": 3.1593, + "theoretical_loss": 4.225877332235987, + "tokens_seen": 255721472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046590772316950854, + "loss": 3.244, + "theoretical_loss": 4.225746970711657, + "tokens_seen": 255787008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658976930792377, + "loss": 3.3587, + "theoretical_loss": 4.225616651932753, + "tokens_seen": 255852544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658876629889669, + "loss": 3.3306, + "theoretical_loss": 4.225486375874315, + "tokens_seen": 255918080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046587763289869614, + "loss": 3.1894, + "theoretical_loss": 4.225356142511402, + "tokens_seen": 255983616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046586760280842526, + "loss": 3.2634, + "theoretical_loss": 4.225225951819099, + "tokens_seen": 256049152 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658575727181545, + "loss": 3.1913, + "theoretical_loss": 4.225095803772507, + "tokens_seen": 256114688 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658475426278836, + "loss": 3.397, + "theoretical_loss": 4.22496569834675, + "tokens_seen": 256180224 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046583751253761286, + "loss": 3.3297, + "theoretical_loss": 4.224835635516973, + "tokens_seen": 256245760 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046582748244734204, + "loss": 3.2385, + "theoretical_loss": 4.224705615258341, + "tokens_seen": 256311296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658174523570712, + "loss": 3.0089, + "theoretical_loss": 4.224575637546041, + "tokens_seen": 256376832 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658074222668004, + "loss": 3.3524, + "theoretical_loss": 4.224445702355279, + "tokens_seen": 256442368 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657973921765296, + "loss": 3.4306, + "theoretical_loss": 4.2243158096612845, + "tokens_seen": 256507904 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046578736208625877, + "loss": 3.1913, + "theoretical_loss": 4.224185959439305, + "tokens_seen": 256573440 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465777331995988, + "loss": 3.2028, + "theoretical_loss": 4.22405615166461, + "tokens_seen": 256638976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046576730190571713, + "loss": 3.3071, + "theoretical_loss": 4.22392638631249, + "tokens_seen": 256704512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046575727181544636, + "loss": 3.2178, + "theoretical_loss": 4.223796663358255, + "tokens_seen": 256770048 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046574724172517555, + "loss": 3.3589, + "theoretical_loss": 4.223666982777237, + "tokens_seen": 256835584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657372116349047, + "loss": 3.3116, + "theoretical_loss": 4.223537344544788, + "tokens_seen": 256901120 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657271815446339, + "loss": 3.2625, + "theoretical_loss": 4.223407748636282, + "tokens_seen": 256966656 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004657171514543631, + "loss": 3.1903, + "theoretical_loss": 4.22327819502711, + "tokens_seen": 257032192 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046570712136409227, + "loss": 3.3744, + "theoretical_loss": 4.223148683692687, + "tokens_seen": 257097728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656970912738215, + "loss": 3.3494, + "theoretical_loss": 4.223019214608446, + "tokens_seen": 257163264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 438564, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.376378059387207, + "objective/train/theoretical_loss": 4.222889787749845, + "objective/train/tokens_used": 277688800, + "theoretical_loss": 4.222889787749845, + "tokens_seen": 257228800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046568706118355063, + "loss": 3.3061, + "theoretical_loss": 4.222889787749845, + "tokens_seen": 257228800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046567703109327987, + "loss": 3.3501, + "theoretical_loss": 4.222760403092358, + "tokens_seen": 257294336 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465667001003009, + "loss": 3.2247, + "theoretical_loss": 4.22263106061148, + "tokens_seen": 257359872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046565697091273823, + "loss": 3.4132, + "theoretical_loss": 4.222501760282729, + "tokens_seen": 257425408 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656469408224674, + "loss": 3.2869, + "theoretical_loss": 4.22237250208164, + "tokens_seen": 257490944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656369107321966, + "loss": 3.3746, + "theoretical_loss": 4.222243285983772, + "tokens_seen": 257556480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656268806419258, + "loss": 3.2052, + "theoretical_loss": 4.222114111964703, + "tokens_seen": 257622016 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046561685055165495, + "loss": 3.1544, + "theoretical_loss": 4.221984980000029, + "tokens_seen": 257687552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046560682046138414, + "loss": 3.1066, + "theoretical_loss": 4.2218558900653695, + "tokens_seen": 257753088 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046559679037111337, + "loss": 3.2498, + "theoretical_loss": 4.221726842136364, + "tokens_seen": 257818624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655867602808425, + "loss": 3.2062, + "theoretical_loss": 4.2215978361886695, + "tokens_seen": 257884160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046557673019057173, + "loss": 3.1854, + "theoretical_loss": 4.221468872197967, + "tokens_seen": 257949696 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655667001003009, + "loss": 3.2238, + "theoretical_loss": 4.221339950139956, + "tokens_seen": 258015232 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655566700100301, + "loss": 3.4212, + "theoretical_loss": 4.221211069990357, + "tokens_seen": 258080768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655466399197593, + "loss": 3.1837, + "theoretical_loss": 4.221082231724908, + "tokens_seen": 258146304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046553660982948846, + "loss": 3.2963, + "theoretical_loss": 4.22095343531937, + "tokens_seen": 258211840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046552657973921764, + "loss": 3.1965, + "theoretical_loss": 4.220824680749525, + "tokens_seen": 258277376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655165496489469, + "loss": 3.1209, + "theoretical_loss": 4.220695967991171, + "tokens_seen": 258342912 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465506519558676, + "loss": 3.3436, + "theoretical_loss": 4.220567297020131, + "tokens_seen": 258408448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046549648946840524, + "loss": 3.2551, + "theoretical_loss": 4.220438667812244, + "tokens_seen": 258473984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046548645937813436, + "loss": 3.4402, + "theoretical_loss": 4.220310080343373, + "tokens_seen": 258539520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654764292878636, + "loss": 3.3646, + "theoretical_loss": 4.220181534589398, + "tokens_seen": 258605056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654663991975928, + "loss": 3.1914, + "theoretical_loss": 4.22005303052622, + "tokens_seen": 258670592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046545636910732196, + "loss": 3.3784, + "theoretical_loss": 4.219924568129759, + "tokens_seen": 258736128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654463390170512, + "loss": 3.1863, + "theoretical_loss": 4.219796147375957, + "tokens_seen": 258801664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 441230, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4770009517669678, + "objective/train/theoretical_loss": 4.219667768240775, + "objective/train/tokens_used": 279327200, + "theoretical_loss": 4.219667768240775, + "tokens_seen": 258867200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654363089267803, + "loss": 3.3003, + "theoretical_loss": 4.219667768240775, + "tokens_seen": 258867200 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046542627883650956, + "loss": 3.2892, + "theoretical_loss": 4.219539430700195, + "tokens_seen": 258932736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046541624874623874, + "loss": 3.2672, + "theoretical_loss": 4.2194111347302155, + "tokens_seen": 258998272 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654062186559679, + "loss": 3.3723, + "theoretical_loss": 4.219282880306859, + "tokens_seen": 259063808 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653961885656971, + "loss": 3.3666, + "theoretical_loss": 4.219154667406166, + "tokens_seen": 259129344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046538615847542634, + "loss": 3.1243, + "theoretical_loss": 4.219026496004198, + "tokens_seen": 259194880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046537612838515546, + "loss": 3.3527, + "theoretical_loss": 4.218898366077035, + "tokens_seen": 259260416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653660982948847, + "loss": 3.1175, + "theoretical_loss": 4.218770277600775, + "tokens_seen": 259325952 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653560682046138, + "loss": 3.1829, + "theoretical_loss": 4.218642230551541, + "tokens_seen": 259391488 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046534603811434306, + "loss": 3.424, + "theoretical_loss": 4.218514224905472, + "tokens_seen": 259457024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046533600802407224, + "loss": 3.2321, + "theoretical_loss": 4.218386260638727, + "tokens_seen": 259522560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653259779338014, + "loss": 3.2345, + "theoretical_loss": 4.2182583377274865, + "tokens_seen": 259588096 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653159478435306, + "loss": 3.349, + "theoretical_loss": 4.218130456147948, + "tokens_seen": 259653632 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004653059177532598, + "loss": 3.1048, + "theoretical_loss": 4.218002615876332, + "tokens_seen": 259719168 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046529588766298897, + "loss": 3.4085, + "theoretical_loss": 4.217874816888877, + "tokens_seen": 259784704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652858575727182, + "loss": 3.1914, + "theoretical_loss": 4.217747059161839, + "tokens_seen": 259850240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046527582748244733, + "loss": 3.1396, + "theoretical_loss": 4.217619342671498, + "tokens_seen": 259915776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046526579739217656, + "loss": 3.1498, + "theoretical_loss": 4.2174916673941505, + "tokens_seen": 259981312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046525576730190575, + "loss": 3.3539, + "theoretical_loss": 4.217364033306113, + "tokens_seen": 260046848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652457372116349, + "loss": 3.1146, + "theoretical_loss": 4.217236440383724, + "tokens_seen": 260112384 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652357071213641, + "loss": 3.2955, + "theoretical_loss": 4.217108888603337, + "tokens_seen": 260177920 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652256770310933, + "loss": 3.2565, + "theoretical_loss": 4.21698137794133, + "tokens_seen": 260243456 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046521564694082247, + "loss": 3.2793, + "theoretical_loss": 4.216853908374097, + "tokens_seen": 260308992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652056168505517, + "loss": 3.2651, + "theoretical_loss": 4.216726479878052, + "tokens_seen": 260374528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046519558676028083, + "loss": 3.2512, + "theoretical_loss": 4.216599092429631, + "tokens_seen": 260440064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 444102, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.212538719177246, + "objective/train/theoretical_loss": 4.216471746005286, + "objective/train/tokens_used": 280965600, + "theoretical_loss": 4.216471746005286, + "tokens_seen": 260505600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046518555667001007, + "loss": 3.2993, + "theoretical_loss": 4.216471746005286, + "tokens_seen": 260505600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651755265797392, + "loss": 3.347, + "theoretical_loss": 4.216344440581491, + "tokens_seen": 260571136 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046516549648946843, + "loss": 3.4183, + "theoretical_loss": 4.2162171761347365, + "tokens_seen": 260636672 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651554663991976, + "loss": 3.4345, + "theoretical_loss": 4.2160899526415365, + "tokens_seen": 260702208 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651454363089268, + "loss": 3.2886, + "theoretical_loss": 4.215962770078422, + "tokens_seen": 260767744 + }, + { + "epoch": 0.08, + "learning_rate": 0.000465135406218656, + "loss": 3.1121, + "theoretical_loss": 4.215835628421942, + "tokens_seen": 260833280 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046512537612838515, + "loss": 3.2858, + "theoretical_loss": 4.215708527648667, + "tokens_seen": 260898816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046511534603811434, + "loss": 3.3236, + "theoretical_loss": 4.215581467735187, + "tokens_seen": 260964352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046510531594784357, + "loss": 3.3611, + "theoretical_loss": 4.215454448658109, + "tokens_seen": 261029888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650952858575727, + "loss": 3.3669, + "theoretical_loss": 4.215327470394062, + "tokens_seen": 261095424 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046508525576730193, + "loss": 3.2499, + "theoretical_loss": 4.215200532919691, + "tokens_seen": 261160960 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650752256770311, + "loss": 3.2358, + "theoretical_loss": 4.215073636211664, + "tokens_seen": 261226496 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650651955867603, + "loss": 3.2307, + "theoretical_loss": 4.214946780246666, + "tokens_seen": 261292032 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650551654964895, + "loss": 3.1684, + "theoretical_loss": 4.214819965001401, + "tokens_seen": 261357568 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046504513540621866, + "loss": 3.2824, + "theoretical_loss": 4.214693190452593, + "tokens_seen": 261423104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046503510531594784, + "loss": 3.288, + "theoretical_loss": 4.214566456576984, + "tokens_seen": 261488640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650250752256771, + "loss": 3.1579, + "theoretical_loss": 4.214439763351336, + "tokens_seen": 261554176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650150451354062, + "loss": 3.154, + "theoretical_loss": 4.214313110752431, + "tokens_seen": 261619712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046500501504513544, + "loss": 3.2123, + "theoretical_loss": 4.214186498757069, + "tokens_seen": 261685248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046499498495486456, + "loss": 3.3921, + "theoretical_loss": 4.214059927342068, + "tokens_seen": 261750784 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649849548645938, + "loss": 3.3091, + "theoretical_loss": 4.213933396484267, + "tokens_seen": 261816320 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464974924774323, + "loss": 3.2964, + "theoretical_loss": 4.213806906160523, + "tokens_seen": 261881856 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046496489468405216, + "loss": 3.3181, + "theoretical_loss": 4.213680456347712, + "tokens_seen": 261947392 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046495486459378134, + "loss": 3.2708, + "theoretical_loss": 4.213554047022729, + "tokens_seen": 262012928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649448345035105, + "loss": 3.2509, + "theoretical_loss": 4.213427678162489, + "tokens_seen": 262078464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 446981, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1180131435394287, + "objective/train/theoretical_loss": 4.213301349743924, + "objective/train/tokens_used": 282604000, + "theoretical_loss": 4.213301349743924, + "tokens_seen": 262144000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649348044132397, + "loss": 3.068, + "theoretical_loss": 4.213301349743924, + "tokens_seen": 262144000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046492477432296894, + "loss": 3.5101, + "theoretical_loss": 4.2131750617439865, + "tokens_seen": 262209536 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046491474423269807, + "loss": 3.3962, + "theoretical_loss": 4.213048814139647, + "tokens_seen": 262275072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649047141424273, + "loss": 3.3437, + "theoretical_loss": 4.212922606907895, + "tokens_seen": 262340608 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648946840521565, + "loss": 3.1475, + "theoretical_loss": 4.21279644002574, + "tokens_seen": 262406144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046488465396188566, + "loss": 3.3946, + "theoretical_loss": 4.212670313470209, + "tokens_seen": 262471680 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046487462387161485, + "loss": 3.207, + "theoretical_loss": 4.212544227218347, + "tokens_seen": 262537216 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464864593781344, + "loss": 3.3328, + "theoretical_loss": 4.21241818124722, + "tokens_seen": 262602752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648545636910732, + "loss": 3.2868, + "theoretical_loss": 4.212292175533912, + "tokens_seen": 262668288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046484453360080244, + "loss": 3.244, + "theoretical_loss": 4.212166210055526, + "tokens_seen": 262733824 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046483450351053157, + "loss": 3.2596, + "theoretical_loss": 4.212040284789181, + "tokens_seen": 262799360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004648244734202608, + "loss": 3.2781, + "theoretical_loss": 4.211914399712019, + "tokens_seen": 262864896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046481444332998993, + "loss": 3.2818, + "theoretical_loss": 4.211788554801198, + "tokens_seen": 262930432 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046480441323971917, + "loss": 3.1677, + "theoretical_loss": 4.211662750033895, + "tokens_seen": 262995968 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046479438314944835, + "loss": 3.3911, + "theoretical_loss": 4.211536985387307, + "tokens_seen": 263061504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046478435305917753, + "loss": 3.303, + "theoretical_loss": 4.211411260838647, + "tokens_seen": 263127040 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647743229689067, + "loss": 3.3061, + "theoretical_loss": 4.2112855763651496, + "tokens_seen": 263192576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046476429287863595, + "loss": 3.3423, + "theoretical_loss": 4.211159931944065, + "tokens_seen": 263258112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647542627883651, + "loss": 3.3091, + "theoretical_loss": 4.211034327552666, + "tokens_seen": 263323648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647442326980943, + "loss": 3.2807, + "theoretical_loss": 4.210908763168239, + "tokens_seen": 263389184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046473420260782344, + "loss": 3.2747, + "theoretical_loss": 4.210783238768093, + "tokens_seen": 263454720 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046472417251755267, + "loss": 3.2455, + "theoretical_loss": 4.210657754329553, + "tokens_seen": 263520256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004647141424272819, + "loss": 3.1118, + "theoretical_loss": 4.210532309829965, + "tokens_seen": 263585792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046470411233701103, + "loss": 3.1963, + "theoretical_loss": 4.21040690524669, + "tokens_seen": 263651328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046469408224674027, + "loss": 3.2165, + "theoretical_loss": 4.21028154055711, + "tokens_seen": 263716864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 448341, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3521900177001953, + "objective/train/theoretical_loss": 4.2101562157386265, + "objective/train/tokens_used": 284242400, + "theoretical_loss": 4.2101562157386265, + "tokens_seen": 263782400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646840521564694, + "loss": 3.2791, + "theoretical_loss": 4.2101562157386265, + "tokens_seen": 263782400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046467402206619863, + "loss": 3.1811, + "theoretical_loss": 4.210030930768655, + "tokens_seen": 263847936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646639919759278, + "loss": 3.3384, + "theoretical_loss": 4.2099056856246335, + "tokens_seen": 263913472 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464653961885657, + "loss": 3.259, + "theoretical_loss": 4.209780480284017, + "tokens_seen": 263979008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646439317953862, + "loss": 3.1449, + "theoretical_loss": 4.209655314724279, + "tokens_seen": 264044544 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046463390170511535, + "loss": 3.3407, + "theoretical_loss": 4.209530188922911, + "tokens_seen": 264110080 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046462387161484454, + "loss": 3.2637, + "theoretical_loss": 4.209405102857422, + "tokens_seen": 264175616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046461384152457377, + "loss": 3.2297, + "theoretical_loss": 4.209280056505342, + "tokens_seen": 264241152 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004646038114343029, + "loss": 3.259, + "theoretical_loss": 4.209155049844217, + "tokens_seen": 264306688 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046459378134403213, + "loss": 3.2319, + "theoretical_loss": 4.209030082851612, + "tokens_seen": 264372224 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645837512537613, + "loss": 3.2291, + "theoretical_loss": 4.208905155505109, + "tokens_seen": 264437760 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645737211634905, + "loss": 3.3001, + "theoretical_loss": 4.20878026778231, + "tokens_seen": 264503296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645636910732197, + "loss": 3.2526, + "theoretical_loss": 4.208655419660834, + "tokens_seen": 264568832 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046455366098294886, + "loss": 3.1062, + "theoretical_loss": 4.208530611118321, + "tokens_seen": 264634368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046454363089267804, + "loss": 3.1764, + "theoretical_loss": 4.208405842132423, + "tokens_seen": 264699904 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645336008024073, + "loss": 2.9908, + "theoretical_loss": 4.208281112680817, + "tokens_seen": 264765440 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004645235707121364, + "loss": 3.173, + "theoretical_loss": 4.208156422741195, + "tokens_seen": 264830976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046451354062186564, + "loss": 3.2522, + "theoretical_loss": 4.208031772291265, + "tokens_seen": 264896512 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046450351053159476, + "loss": 3.3833, + "theoretical_loss": 4.207907161308757, + "tokens_seen": 264962048 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464493480441324, + "loss": 3.3016, + "theoretical_loss": 4.2077825897714165, + "tokens_seen": 265027584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644834503510532, + "loss": 3.3126, + "theoretical_loss": 4.207658057657008, + "tokens_seen": 265093120 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046447342026078236, + "loss": 3.0745, + "theoretical_loss": 4.207533564943316, + "tokens_seen": 265158656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046446339017051154, + "loss": 3.2312, + "theoretical_loss": 4.207409111608138, + "tokens_seen": 265224192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644533600802407, + "loss": 3.1868, + "theoretical_loss": 4.2072846976292935, + "tokens_seen": 265289728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644433299899699, + "loss": 3.4107, + "theoretical_loss": 4.2071603229846195, + "tokens_seen": 265355264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 451154, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.682401418685913, + "objective/train/theoretical_loss": 4.20703598765197, + "objective/train/tokens_used": 285880800, + "theoretical_loss": 4.20703598765197, + "tokens_seen": 265420800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046443329989969914, + "loss": 3.5003, + "theoretical_loss": 4.20703598765197, + "tokens_seen": 265420800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046442326980942827, + "loss": 3.221, + "theoretical_loss": 4.206911691609217, + "tokens_seen": 265486336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644132397191575, + "loss": 3.3845, + "theoretical_loss": 4.206787434834251, + "tokens_seen": 265551872 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644032096288867, + "loss": 3.3041, + "theoretical_loss": 4.20666321730498, + "tokens_seen": 265617408 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046439317953861586, + "loss": 3.218, + "theoretical_loss": 4.206539038999329, + "tokens_seen": 265682944 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046438314944834505, + "loss": 3.1954, + "theoretical_loss": 4.206414899895244, + "tokens_seen": 265748480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004643731193580742, + "loss": 3.3311, + "theoretical_loss": 4.206290799970685, + "tokens_seen": 265814016 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004643630892678034, + "loss": 3.2099, + "theoretical_loss": 4.206166739203632, + "tokens_seen": 265879552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046435305917753264, + "loss": 3.3496, + "theoretical_loss": 4.206042717572082, + "tokens_seen": 265945088 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046434302908726177, + "loss": 3.1489, + "theoretical_loss": 4.20591873505405, + "tokens_seen": 266010624 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464332998996991, + "loss": 3.25, + "theoretical_loss": 4.20579479162757, + "tokens_seen": 266076160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046432296890672013, + "loss": 3.2905, + "theoretical_loss": 4.205670887270691, + "tokens_seen": 266141696 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046431293881644937, + "loss": 3.325, + "theoretical_loss": 4.205547021961482, + "tokens_seen": 266207232 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046430290872617855, + "loss": 3.1504, + "theoretical_loss": 4.205423195678029, + "tokens_seen": 266272768 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046429287863590773, + "loss": 3.3181, + "theoretical_loss": 4.205299408398435, + "tokens_seen": 266338304 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642828485456369, + "loss": 3.3075, + "theoretical_loss": 4.2051756601008226, + "tokens_seen": 266403840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046427281845536615, + "loss": 3.2238, + "theoretical_loss": 4.20505195076333, + "tokens_seen": 266469376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642627883650953, + "loss": 3.3286, + "theoretical_loss": 4.204928280364115, + "tokens_seen": 266534912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642527582748245, + "loss": 3.3207, + "theoretical_loss": 4.20480464888135, + "tokens_seen": 266600448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046424272818455364, + "loss": 3.4096, + "theoretical_loss": 4.204681056293228, + "tokens_seen": 266665984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046423269809428287, + "loss": 3.3301, + "theoretical_loss": 4.204557502577957, + "tokens_seen": 266731520 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046422266800401205, + "loss": 3.3231, + "theoretical_loss": 4.204433987713767, + "tokens_seen": 266797056 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046421263791374123, + "loss": 3.2258, + "theoretical_loss": 4.2043105116789, + "tokens_seen": 266862592 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004642026078234704, + "loss": 3.3148, + "theoretical_loss": 4.204187074451617, + "tokens_seen": 266928128 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641925777331996, + "loss": 3.2287, + "theoretical_loss": 4.204063676010202, + "tokens_seen": 266993664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 453942, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.323920249938965, + "objective/train/theoretical_loss": 4.203940316332948, + "objective/train/tokens_used": 287519200, + "theoretical_loss": 4.203940316332948, + "tokens_seen": 267059200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641825476429288, + "loss": 3.3774, + "theoretical_loss": 4.203940316332948, + "tokens_seen": 267059200 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464172517552658, + "loss": 3.2691, + "theoretical_loss": 4.203816995398171, + "tokens_seen": 267124736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046416248746238714, + "loss": 3.1999, + "theoretical_loss": 4.203693713184203, + "tokens_seen": 267190272 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641524573721164, + "loss": 3.2937, + "theoretical_loss": 4.203570469669392, + "tokens_seen": 267255808 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641424272818455, + "loss": 3.5384, + "theoretical_loss": 4.203447264832107, + "tokens_seen": 267321344 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046413239719157474, + "loss": 3.2634, + "theoretical_loss": 4.203324098650731, + "tokens_seen": 267386880 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641223671013039, + "loss": 3.3685, + "theoretical_loss": 4.203200971103666, + "tokens_seen": 267452416 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641123370110331, + "loss": 3.2801, + "theoretical_loss": 4.20307788216933, + "tokens_seen": 267517952 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641023069207623, + "loss": 3.1901, + "theoretical_loss": 4.202954831826159, + "tokens_seen": 267583488 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640922768304915, + "loss": 3.2512, + "theoretical_loss": 4.202831820052609, + "tokens_seen": 267649024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046408224674022064, + "loss": 3.2436, + "theoretical_loss": 4.202708846827148, + "tokens_seen": 267714560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640722166499499, + "loss": 3.2016, + "theoretical_loss": 4.202585912128266, + "tokens_seen": 267780096 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464062186559679, + "loss": 3.2147, + "theoretical_loss": 4.202463015934468, + "tokens_seen": 267845632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046405215646940824, + "loss": 3.3092, + "theoretical_loss": 4.202340158224277, + "tokens_seen": 267911168 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640421263791374, + "loss": 3.4041, + "theoretical_loss": 4.202217338976231, + "tokens_seen": 267976704 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640320962888666, + "loss": 3.1784, + "theoretical_loss": 4.2020945581688895, + "tokens_seen": 268042240 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004640220661985958, + "loss": 3.2712, + "theoretical_loss": 4.201971815780826, + "tokens_seen": 268107776 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046401203610832496, + "loss": 3.2498, + "theoretical_loss": 4.201849111790631, + "tokens_seen": 268173312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046400200601805414, + "loss": 3.2644, + "theoretical_loss": 4.201726446176915, + "tokens_seen": 268238848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639919759277834, + "loss": 3.0872, + "theoretical_loss": 4.201603818918302, + "tokens_seen": 268304384 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639819458375125, + "loss": 3.4422, + "theoretical_loss": 4.201481229993435, + "tokens_seen": 268369920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046397191574724174, + "loss": 3.3044, + "theoretical_loss": 4.201358679380976, + "tokens_seen": 268435456 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639618856569709, + "loss": 3.2938, + "theoretical_loss": 4.201236167059601, + "tokens_seen": 268500992 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639518555667001, + "loss": 3.1589, + "theoretical_loss": 4.201113693008002, + "tokens_seen": 268566528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046394182547642934, + "loss": 3.3791, + "theoretical_loss": 4.200991257204894, + "tokens_seen": 268632064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 456515, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.509286642074585, + "objective/train/theoretical_loss": 4.2008688596290025, + "objective/train/tokens_used": 289157600, + "theoretical_loss": 4.2008688596290025, + "tokens_seen": 268697600 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046393179538615847, + "loss": 3.1183, + "theoretical_loss": 4.2008688596290025, + "tokens_seen": 268697600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639217652958877, + "loss": 3.3757, + "theoretical_loss": 4.200746500259073, + "tokens_seen": 268763136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639117352056169, + "loss": 3.2319, + "theoretical_loss": 4.200624179073869, + "tokens_seen": 268828672 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046390170511534606, + "loss": 3.3695, + "theoretical_loss": 4.2005018960521685, + "tokens_seen": 268894208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046389167502507525, + "loss": 3.2186, + "theoretical_loss": 4.200379651172769, + "tokens_seen": 268959744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638816449348044, + "loss": 3.1319, + "theoretical_loss": 4.200257444414483, + "tokens_seen": 269025280 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638716148445336, + "loss": 3.369, + "theoretical_loss": 4.200135275756139, + "tokens_seen": 269090816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046386158475426284, + "loss": 3.3535, + "theoretical_loss": 4.200013145176587, + "tokens_seen": 269156352 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046385155466399197, + "loss": 3.2764, + "theoretical_loss": 4.199891052654689, + "tokens_seen": 269221888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004638415245737212, + "loss": 3.2407, + "theoretical_loss": 4.199768998169326, + "tokens_seen": 269287424 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046383149448345033, + "loss": 3.2556, + "theoretical_loss": 4.199646981699395, + "tokens_seen": 269352960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046382146439317957, + "loss": 3.3283, + "theoretical_loss": 4.199525003223812, + "tokens_seen": 269418496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046381143430290875, + "loss": 3.2216, + "theoretical_loss": 4.199403062721506, + "tokens_seen": 269484032 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046380140421263793, + "loss": 3.2721, + "theoretical_loss": 4.199281160171427, + "tokens_seen": 269549568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637913741223671, + "loss": 3.366, + "theoretical_loss": 4.1991592955525405, + "tokens_seen": 269615104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046378134403209635, + "loss": 3.2108, + "theoretical_loss": 4.199037468843825, + "tokens_seen": 269680640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637713139418255, + "loss": 3.3268, + "theoretical_loss": 4.198915680024282, + "tokens_seen": 269746176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637612838515547, + "loss": 3.4147, + "theoretical_loss": 4.198793929072925, + "tokens_seen": 269811712 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046375125376128384, + "loss": 3.3562, + "theoretical_loss": 4.198672215968785, + "tokens_seen": 269877248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046374122367101307, + "loss": 3.3748, + "theoretical_loss": 4.198550540690912, + "tokens_seen": 269942784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046373119358074225, + "loss": 3.2943, + "theoretical_loss": 4.198428903218371, + "tokens_seen": 270008320 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046372116349047143, + "loss": 3.2792, + "theoretical_loss": 4.198307303530243, + "tokens_seen": 270073856 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637111334002006, + "loss": 3.3353, + "theoretical_loss": 4.198185741605628, + "tokens_seen": 270139392 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004637011033099298, + "loss": 3.2839, + "theoretical_loss": 4.19806421742364, + "tokens_seen": 270204928 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463691073219659, + "loss": 2.9842, + "theoretical_loss": 4.197942730963412, + "tokens_seen": 270270464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 459334, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1412386894226074, + "objective/train/theoretical_loss": 4.19782128220409, + "objective/train/tokens_used": 290796000, + "theoretical_loss": 4.19782128220409, + "tokens_seen": 270336000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636810431293882, + "loss": 3.3422, + "theoretical_loss": 4.19782128220409, + "tokens_seen": 270336000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046367101303911734, + "loss": 3.3724, + "theoretical_loss": 4.19769987112484, + "tokens_seen": 270401536 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636609829488466, + "loss": 3.1869, + "theoretical_loss": 4.1975784977048445, + "tokens_seen": 270467072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636509528585757, + "loss": 3.2433, + "theoretical_loss": 4.1974571619233, + "tokens_seen": 270532608 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046364092276830494, + "loss": 3.2574, + "theoretical_loss": 4.197335863759422, + "tokens_seen": 270598144 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636308926780341, + "loss": 3.2291, + "theoretical_loss": 4.1972146031924416, + "tokens_seen": 270663680 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636208625877633, + "loss": 3.3036, + "theoretical_loss": 4.197093380201606, + "tokens_seen": 270729216 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636108324974925, + "loss": 3.2416, + "theoretical_loss": 4.196972194766179, + "tokens_seen": 270794752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636008024072217, + "loss": 3.3301, + "theoretical_loss": 4.196851046865442, + "tokens_seen": 270860288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046359077231695084, + "loss": 3.326, + "theoretical_loss": 4.1967299364786905, + "tokens_seen": 270925824 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635807422266801, + "loss": 3.284, + "theoretical_loss": 4.196608863585239, + "tokens_seen": 270991360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635707121364092, + "loss": 3.3136, + "theoretical_loss": 4.1964878281644165, + "tokens_seen": 271056896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046356068204613844, + "loss": 3.3413, + "theoretical_loss": 4.19636683019557, + "tokens_seen": 271122432 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635506519558676, + "loss": 3.2119, + "theoretical_loss": 4.196245869658061, + "tokens_seen": 271187968 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635406218655968, + "loss": 3.3129, + "theoretical_loss": 4.1961249465312696, + "tokens_seen": 271253504 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463530591775326, + "loss": 3.4302, + "theoretical_loss": 4.196004060794589, + "tokens_seen": 271319040 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046352056168505516, + "loss": 3.1929, + "theoretical_loss": 4.195883212427433, + "tokens_seen": 271384576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046351053159478434, + "loss": 3.2625, + "theoretical_loss": 4.195762401409229, + "tokens_seen": 271450112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635005015045136, + "loss": 3.3358, + "theoretical_loss": 4.19564162771942, + "tokens_seen": 271515648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634904714142427, + "loss": 3.3039, + "theoretical_loss": 4.195520891337466, + "tokens_seen": 271581184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046348044132397194, + "loss": 3.2115, + "theoretical_loss": 4.195400192242845, + "tokens_seen": 271646720 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046347041123370107, + "loss": 3.2505, + "theoretical_loss": 4.19527953041505, + "tokens_seen": 271712256 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634603811434303, + "loss": 3.2679, + "theoretical_loss": 4.19515890583359, + "tokens_seen": 271777792 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634503510531595, + "loss": 3.2119, + "theoretical_loss": 4.195038318477989, + "tokens_seen": 271843328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046344032096288867, + "loss": 3.3972, + "theoretical_loss": 4.194917768327789, + "tokens_seen": 271908864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 462056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4388773441314697, + "objective/train/theoretical_loss": 4.194797255362549, + "objective/train/tokens_used": 292434400, + "theoretical_loss": 4.194797255362549, + "tokens_seen": 271974400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046343029087261785, + "loss": 3.2391, + "theoretical_loss": 4.194797255362549, + "tokens_seen": 271974400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634202607823471, + "loss": 3.1617, + "theoretical_loss": 4.194676779561841, + "tokens_seen": 272039936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004634102306920762, + "loss": 3.1768, + "theoretical_loss": 4.194556340905256, + "tokens_seen": 272105472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046340020060180545, + "loss": 3.2419, + "theoretical_loss": 4.194435939372401, + "tokens_seen": 272171008 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046339017051153457, + "loss": 3.3242, + "theoretical_loss": 4.194315574942896, + "tokens_seen": 272236544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633801404212638, + "loss": 3.254, + "theoretical_loss": 4.194195247596381, + "tokens_seen": 272302080 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463370110330993, + "loss": 3.1144, + "theoretical_loss": 4.19407495731251, + "tokens_seen": 272367616 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046336008024072217, + "loss": 3.285, + "theoretical_loss": 4.193954704070952, + "tokens_seen": 272433152 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046335005015045135, + "loss": 3.3314, + "theoretical_loss": 4.193834487851396, + "tokens_seen": 272498688 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046334002006018053, + "loss": 3.1544, + "theoretical_loss": 4.193714308633542, + "tokens_seen": 272564224 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633299899699097, + "loss": 3.1403, + "theoretical_loss": 4.1935941663971095, + "tokens_seen": 272629760 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046331995987963895, + "loss": 3.3156, + "theoretical_loss": 4.193474061121833, + "tokens_seen": 272695296 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633099297893681, + "loss": 3.1741, + "theoretical_loss": 4.193353992787463, + "tokens_seen": 272760832 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632998996990973, + "loss": 3.1634, + "theoretical_loss": 4.193233961373766, + "tokens_seen": 272826368 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046328986960882644, + "loss": 3.1913, + "theoretical_loss": 4.1931139668605235, + "tokens_seen": 272891904 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632798395185557, + "loss": 3.189, + "theoretical_loss": 4.192994009227535, + "tokens_seen": 272957440 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046326980942828485, + "loss": 3.301, + "theoretical_loss": 4.192874088454613, + "tokens_seen": 273022976 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046325977933801404, + "loss": 3.3089, + "theoretical_loss": 4.19275420452159, + "tokens_seen": 273088512 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632497492477432, + "loss": 3.1983, + "theoretical_loss": 4.192634357408309, + "tokens_seen": 273154048 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046323971915747245, + "loss": 3.3217, + "theoretical_loss": 4.192514547094634, + "tokens_seen": 273219584 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632296890672016, + "loss": 3.2378, + "theoretical_loss": 4.192394773560441, + "tokens_seen": 273285120 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004632196589769308, + "loss": 3.3763, + "theoretical_loss": 4.192275036785625, + "tokens_seen": 273350656 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046320962888666, + "loss": 3.3252, + "theoretical_loss": 4.192155336750094, + "tokens_seen": 273416192 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631995987963892, + "loss": 3.13, + "theoretical_loss": 4.192035673433773, + "tokens_seen": 273481728 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631895687061184, + "loss": 3.1406, + "theoretical_loss": 4.191916046816605, + "tokens_seen": 273547264 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 465025, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5158631801605225, + "objective/train/theoretical_loss": 4.191796456878544, + "objective/train/tokens_used": 294072800, + "theoretical_loss": 4.191796456878544, + "tokens_seen": 273612800 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046317953861584754, + "loss": 3.3463, + "theoretical_loss": 4.191796456878544, + "tokens_seen": 273612800 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631695085255768, + "loss": 3.2351, + "theoretical_loss": 4.191676903599563, + "tokens_seen": 273678336 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631594784353059, + "loss": 3.4088, + "theoretical_loss": 4.191557386959651, + "tokens_seen": 273743872 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046314944834503514, + "loss": 3.2924, + "theoretical_loss": 4.191437906938811, + "tokens_seen": 273809408 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631394182547643, + "loss": 3.0649, + "theoretical_loss": 4.191318463517062, + "tokens_seen": 273874944 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631293881644935, + "loss": 3.2478, + "theoretical_loss": 4.19119905667444, + "tokens_seen": 273940480 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631193580742227, + "loss": 3.3255, + "theoretical_loss": 4.191079686390996, + "tokens_seen": 274006016 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004631093279839519, + "loss": 3.1485, + "theoretical_loss": 4.190960352646796, + "tokens_seen": 274071552 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046309929789368104, + "loss": 3.0913, + "theoretical_loss": 4.190841055421921, + "tokens_seen": 274137088 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630892678034103, + "loss": 3.3214, + "theoretical_loss": 4.19072179469647, + "tokens_seen": 274202624 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630792377131394, + "loss": 3.1708, + "theoretical_loss": 4.190602570450556, + "tokens_seen": 274268160 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046306920762286864, + "loss": 3.2807, + "theoretical_loss": 4.190483382664308, + "tokens_seen": 274333696 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630591775325978, + "loss": 3.1466, + "theoretical_loss": 4.19036423131787, + "tokens_seen": 274399232 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463049147442327, + "loss": 3.3709, + "theoretical_loss": 4.190245116391403, + "tokens_seen": 274464768 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630391173520562, + "loss": 3.2199, + "theoretical_loss": 4.190126037865082, + "tokens_seen": 274530304 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046302908726178536, + "loss": 3.2297, + "theoretical_loss": 4.190006995719098, + "tokens_seen": 274595840 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046301905717151455, + "loss": 3.3752, + "theoretical_loss": 4.1898879899336565, + "tokens_seen": 274661376 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004630090270812438, + "loss": 3.255, + "theoretical_loss": 4.189769020488981, + "tokens_seen": 274726912 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629989969909729, + "loss": 3.2247, + "theoretical_loss": 4.189650087365309, + "tokens_seen": 274792448 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046298896690070214, + "loss": 3.4144, + "theoretical_loss": 4.189531190542893, + "tokens_seen": 274857984 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046297893681043127, + "loss": 3.3542, + "theoretical_loss": 4.189412330002001, + "tokens_seen": 274923520 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629689067201605, + "loss": 3.331, + "theoretical_loss": 4.189293505722918, + "tokens_seen": 274989056 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629588766298897, + "loss": 3.0523, + "theoretical_loss": 4.189174717685942, + "tokens_seen": 275054592 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046294884653961887, + "loss": 3.2162, + "theoretical_loss": 4.189055965871389, + "tokens_seen": 275120128 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046293881644934805, + "loss": 3.3026, + "theoretical_loss": 4.188937250259587, + "tokens_seen": 275185664 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 466444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2781283855438232, + "objective/train/theoretical_loss": 4.188818570830883, + "objective/train/tokens_used": 295711200, + "theoretical_loss": 4.188818570830883, + "tokens_seen": 275251200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629287863590773, + "loss": 3.1923, + "theoretical_loss": 4.188818570830883, + "tokens_seen": 275251200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004629187562688064, + "loss": 3.4042, + "theoretical_loss": 4.188699927565638, + "tokens_seen": 275316736 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046290872617853565, + "loss": 3.2788, + "theoretical_loss": 4.188581320444228, + "tokens_seen": 275382272 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046289869608826477, + "loss": 3.3113, + "theoretical_loss": 4.1884627494470426, + "tokens_seen": 275447808 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462888665997994, + "loss": 3.3402, + "theoretical_loss": 4.1883442145544905, + "tokens_seen": 275513344 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628786359077232, + "loss": 3.3814, + "theoretical_loss": 4.188225715746992, + "tokens_seen": 275578880 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046286860581745237, + "loss": 3.2007, + "theoretical_loss": 4.188107253004986, + "tokens_seen": 275644416 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046285857572718155, + "loss": 3.2649, + "theoretical_loss": 4.187988826308925, + "tokens_seen": 275709952 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046284854563691073, + "loss": 3.073, + "theoretical_loss": 4.187870435639275, + "tokens_seen": 275775488 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628385155466399, + "loss": 2.9523, + "theoretical_loss": 4.18775208097652, + "tokens_seen": 275841024 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046282848545636915, + "loss": 3.2049, + "theoretical_loss": 4.187633762301159, + "tokens_seen": 275906560 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628184553660983, + "loss": 3.2977, + "theoretical_loss": 4.187515479593704, + "tokens_seen": 275972096 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004628084252758275, + "loss": 3.2228, + "theoretical_loss": 4.187397232834683, + "tokens_seen": 276037632 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046279839518555664, + "loss": 3.2648, + "theoretical_loss": 4.187279022004642, + "tokens_seen": 276103168 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627883650952859, + "loss": 3.2005, + "theoretical_loss": 4.1871608470841375, + "tokens_seen": 276168704 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046277833500501505, + "loss": 3.2694, + "theoretical_loss": 4.1870427080537445, + "tokens_seen": 276234240 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046276830491474424, + "loss": 3.3122, + "theoretical_loss": 4.1869246048940525, + "tokens_seen": 276299776 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627582748244734, + "loss": 3.2794, + "theoretical_loss": 4.186806537585666, + "tokens_seen": 276365312 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046274824473420265, + "loss": 3.2367, + "theoretical_loss": 4.186688506109202, + "tokens_seen": 276430848 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627382146439318, + "loss": 3.1319, + "theoretical_loss": 4.186570510445296, + "tokens_seen": 276496384 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462728184553661, + "loss": 3.2454, + "theoretical_loss": 4.186452550574599, + "tokens_seen": 276561920 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046271815446339014, + "loss": 3.273, + "theoretical_loss": 4.186334626477774, + "tokens_seen": 276627456 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627081243731194, + "loss": 3.2684, + "theoretical_loss": 4.186216738135501, + "tokens_seen": 276692992 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046269809428284856, + "loss": 3.2145, + "theoretical_loss": 4.186098885528473, + "tokens_seen": 276758528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046268806419257774, + "loss": 3.1902, + "theoretical_loss": 4.185981068637401, + "tokens_seen": 276824064 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 468697, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1106276512145996, + "objective/train/theoretical_loss": 4.185863287443008, + "objective/train/tokens_used": 297349600, + "theoretical_loss": 4.185863287443008, + "tokens_seen": 276889600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626780341023069, + "loss": 3.2484, + "theoretical_loss": 4.185863287443008, + "tokens_seen": 276889600 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626680040120361, + "loss": 3.1678, + "theoretical_loss": 4.185745541926035, + "tokens_seen": 276955136 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626579739217653, + "loss": 3.1812, + "theoretical_loss": 4.185627832067237, + "tokens_seen": 277020672 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626479438314945, + "loss": 3.2644, + "theoretical_loss": 4.1855101578473795, + "tokens_seen": 277086208 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046263791374122364, + "loss": 3.1643, + "theoretical_loss": 4.18539251924725, + "tokens_seen": 277151744 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626278836509529, + "loss": 3.189, + "theoretical_loss": 4.185274916247646, + "tokens_seen": 277217280 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462617853560682, + "loss": 3.2604, + "theoretical_loss": 4.185157348829383, + "tokens_seen": 277282816 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046260782347041124, + "loss": 3.0528, + "theoretical_loss": 4.185039816973289, + "tokens_seen": 277348352 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625977933801404, + "loss": 3.2857, + "theoretical_loss": 4.184922320660207, + "tokens_seen": 277413888 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625877632898696, + "loss": 3.3717, + "theoretical_loss": 4.184804859870997, + "tokens_seen": 277479424 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625777331995988, + "loss": 3.186, + "theoretical_loss": 4.184687434586531, + "tokens_seen": 277544960 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462567703109328, + "loss": 3.2449, + "theoretical_loss": 4.184570044787698, + "tokens_seen": 277610496 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046255767301905715, + "loss": 3.3232, + "theoretical_loss": 4.1844526904554, + "tokens_seen": 277676032 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625476429287864, + "loss": 3.1228, + "theoretical_loss": 4.184335371570556, + "tokens_seen": 277741568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625376128385155, + "loss": 3.2354, + "theoretical_loss": 4.184218088114097, + "tokens_seen": 277807104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046252758274824475, + "loss": 3.288, + "theoretical_loss": 4.1841008400669715, + "tokens_seen": 277872640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625175526579739, + "loss": 3.3079, + "theoretical_loss": 4.183983627410142, + "tokens_seen": 277938176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625075225677031, + "loss": 3.0379, + "theoretical_loss": 4.183866450124584, + "tokens_seen": 278003712 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624974924774323, + "loss": 3.2248, + "theoretical_loss": 4.18374930819129, + "tokens_seen": 278069248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046248746238716147, + "loss": 3.2118, + "theoretical_loss": 4.183632201591264, + "tokens_seen": 278134784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046247743229689065, + "loss": 3.2803, + "theoretical_loss": 4.18351513030553, + "tokens_seen": 278200320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624674022066199, + "loss": 3.0813, + "theoretical_loss": 4.1833980943151206, + "tokens_seen": 278265856 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046245737211634907, + "loss": 3.2069, + "theoretical_loss": 4.183281093601087, + "tokens_seen": 278331392 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046244734202607825, + "loss": 3.3288, + "theoretical_loss": 4.183164128144495, + "tokens_seen": 278396928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624373119358075, + "loss": 3.3337, + "theoretical_loss": 4.183047197926422, + "tokens_seen": 278462464 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 471337, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9572341442108154, + "objective/train/theoretical_loss": 4.182930302927963, + "objective/train/tokens_used": 298988000, + "theoretical_loss": 4.182930302927963, + "tokens_seen": 278528000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004624272818455366, + "loss": 3.2296, + "theoretical_loss": 4.182930302927963, + "tokens_seen": 278528000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046241725175526585, + "loss": 3.1991, + "theoretical_loss": 4.182813443130227, + "tokens_seen": 278593536 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462407221664995, + "loss": 3.296, + "theoretical_loss": 4.182696618514337, + "tokens_seen": 278659072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623971915747242, + "loss": 3.1124, + "theoretical_loss": 4.18257982906143, + "tokens_seen": 278724608 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623871614844534, + "loss": 3.2479, + "theoretical_loss": 4.1824630747526585, + "tokens_seen": 278790144 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046237713139418257, + "loss": 3.2902, + "theoretical_loss": 4.182346355569189, + "tokens_seen": 278855680 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046236710130391175, + "loss": 3.3323, + "theoretical_loss": 4.182229671492204, + "tokens_seen": 278921216 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046235707121364093, + "loss": 3.2005, + "theoretical_loss": 4.1821130225028975, + "tokens_seen": 278986752 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623470411233701, + "loss": 3.3921, + "theoretical_loss": 4.1819964085824815, + "tokens_seen": 279052288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046233701103309935, + "loss": 3.3107, + "theoretical_loss": 4.181879829712178, + "tokens_seen": 279117824 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623269809428285, + "loss": 3.1956, + "theoretical_loss": 4.181763285873231, + "tokens_seen": 279183360 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623169508525577, + "loss": 3.3029, + "theoretical_loss": 4.181646777046889, + "tokens_seen": 279248896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046230692076228684, + "loss": 3.1833, + "theoretical_loss": 4.181530303214423, + "tokens_seen": 279314432 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622968906720161, + "loss": 3.4596, + "theoretical_loss": 4.181413864357115, + "tokens_seen": 279379968 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046228686058174525, + "loss": 3.0404, + "theoretical_loss": 4.181297460456262, + "tokens_seen": 279445504 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046227683049147444, + "loss": 3.304, + "theoretical_loss": 4.181181091493174, + "tokens_seen": 279511040 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622668004012036, + "loss": 3.3912, + "theoretical_loss": 4.181064757449178, + "tokens_seen": 279576576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046225677031093285, + "loss": 3.2555, + "theoretical_loss": 4.180948458305615, + "tokens_seen": 279642112 + }, + { + "epoch": 0.08, + "learning_rate": 0.000462246740220662, + "loss": 3.2438, + "theoretical_loss": 4.180832194043836, + "tokens_seen": 279707648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622367101303912, + "loss": 3.3578, + "theoretical_loss": 4.180715964645213, + "tokens_seen": 279773184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046222668004012034, + "loss": 3.3404, + "theoretical_loss": 4.180599770091126, + "tokens_seen": 279838720 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622166499498496, + "loss": 3.2915, + "theoretical_loss": 4.180483610362975, + "tokens_seen": 279904256 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046220661985957876, + "loss": 3.3809, + "theoretical_loss": 4.18036748544217, + "tokens_seen": 279969792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046219658976930794, + "loss": 3.3033, + "theoretical_loss": 4.180251395310137, + "tokens_seen": 280035328 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621865596790371, + "loss": 3.1155, + "theoretical_loss": 4.1801353399483165, + "tokens_seen": 280100864 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 474317, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3007760047912598, + "objective/train/theoretical_loss": 4.180019319338163, + "objective/train/tokens_used": 300626400, + "theoretical_loss": 4.180019319338163, + "tokens_seen": 280166400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621765295887663, + "loss": 3.3376, + "theoretical_loss": 4.180019319338163, + "tokens_seen": 280166400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621664994984955, + "loss": 3.306, + "theoretical_loss": 4.179903333461144, + "tokens_seen": 280231936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621564694082247, + "loss": 3.1993, + "theoretical_loss": 4.179787382298744, + "tokens_seen": 280297472 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046214643931795384, + "loss": 3.1277, + "theoretical_loss": 4.179671465832458, + "tokens_seen": 280363008 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621364092276831, + "loss": 3.2343, + "theoretical_loss": 4.179555584043799, + "tokens_seen": 280428544 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004621263791374122, + "loss": 2.975, + "theoretical_loss": 4.17943973691429, + "tokens_seen": 280494080 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046211634904714144, + "loss": 3.2278, + "theoretical_loss": 4.179323924425472, + "tokens_seen": 280559616 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004621063189568706, + "loss": 3.2567, + "theoretical_loss": 4.179208146558899, + "tokens_seen": 280625152 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620962888665998, + "loss": 3.2388, + "theoretical_loss": 4.1790924032961385, + "tokens_seen": 280690688 + }, + { + "epoch": 0.09, + "learning_rate": 0.000462086258776329, + "loss": 3.1798, + "theoretical_loss": 4.178976694618772, + "tokens_seen": 280756224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620762286860582, + "loss": 3.1971, + "theoretical_loss": 4.178861020508395, + "tokens_seen": 280821760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046206619859578735, + "loss": 3.2644, + "theoretical_loss": 4.178745380946619, + "tokens_seen": 280887296 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620561685055166, + "loss": 3.1728, + "theoretical_loss": 4.178629775915066, + "tokens_seen": 280952832 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620461384152457, + "loss": 3.3459, + "theoretical_loss": 4.178514205395376, + "tokens_seen": 281018368 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046203610832497495, + "loss": 3.312, + "theoretical_loss": 4.178398669369201, + "tokens_seen": 281083904 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620260782347041, + "loss": 3.306, + "theoretical_loss": 4.178283167818206, + "tokens_seen": 281149440 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620160481444333, + "loss": 3.2606, + "theoretical_loss": 4.178167700724073, + "tokens_seen": 281214976 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004620060180541625, + "loss": 3.1218, + "theoretical_loss": 4.178052268068494, + "tokens_seen": 281280512 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046199598796389167, + "loss": 3.1783, + "theoretical_loss": 4.177936869833179, + "tokens_seen": 281346048 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046198595787362085, + "loss": 3.1821, + "theoretical_loss": 4.17782150599985, + "tokens_seen": 281411584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619759277833501, + "loss": 3.277, + "theoretical_loss": 4.1777061765502435, + "tokens_seen": 281477120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619658976930792, + "loss": 3.181, + "theoretical_loss": 4.1775908814661085, + "tokens_seen": 281542656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046195586760280845, + "loss": 3.1402, + "theoretical_loss": 4.17747562072921, + "tokens_seen": 281608192 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046194583751253763, + "loss": 3.319, + "theoretical_loss": 4.177360394321325, + "tokens_seen": 281673728 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619358074222668, + "loss": 3.2174, + "theoretical_loss": 4.177245202224246, + "tokens_seen": 281739264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 477057, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3766489028930664, + "objective/train/theoretical_loss": 4.17713004441978, + "objective/train/tokens_used": 302264800, + "theoretical_loss": 4.17713004441978, + "tokens_seen": 281804800 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461925777331996, + "loss": 3.1165, + "theoretical_loss": 4.17713004441978, + "tokens_seen": 281804800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619157472417252, + "loss": 3.0603, + "theoretical_loss": 4.177014920889745, + "tokens_seen": 281870336 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046190571715145435, + "loss": 3.3931, + "theoretical_loss": 4.176899831615974, + "tokens_seen": 281935872 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618956870611836, + "loss": 3.3187, + "theoretical_loss": 4.176784776580316, + "tokens_seen": 282001408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618856569709127, + "loss": 2.9638, + "theoretical_loss": 4.176669755764632, + "tokens_seen": 282066944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046187562688064195, + "loss": 3.1485, + "theoretical_loss": 4.176554769150796, + "tokens_seen": 282132480 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618655967903711, + "loss": 3.3024, + "theoretical_loss": 4.176439816720697, + "tokens_seen": 282198016 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618555667001003, + "loss": 3.3095, + "theoretical_loss": 4.1763248984562376, + "tokens_seen": 282263552 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618455366098295, + "loss": 3.2518, + "theoretical_loss": 4.176210014339335, + "tokens_seen": 282329088 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618355065195587, + "loss": 3.2887, + "theoretical_loss": 4.17609516435192, + "tokens_seen": 282394624 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046182547642928786, + "loss": 3.2044, + "theoretical_loss": 4.1759803484759335, + "tokens_seen": 282460160 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046181544633901704, + "loss": 3.3224, + "theoretical_loss": 4.175865566693336, + "tokens_seen": 282525696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004618054162487462, + "loss": 3.2691, + "theoretical_loss": 4.175750818986098, + "tokens_seen": 282591232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046179538615847545, + "loss": 3.2076, + "theoretical_loss": 4.1756361053362046, + "tokens_seen": 282656768 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617853560682046, + "loss": 3.2888, + "theoretical_loss": 4.1755214257256545, + "tokens_seen": 282722304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617753259779338, + "loss": 2.9395, + "theoretical_loss": 4.17540678013646, + "tokens_seen": 282787840 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461765295887663, + "loss": 3.2612, + "theoretical_loss": 4.175292168550648, + "tokens_seen": 282853376 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617552657973922, + "loss": 3.2667, + "theoretical_loss": 4.175177590950257, + "tokens_seen": 282918912 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046174523570712136, + "loss": 3.3709, + "theoretical_loss": 4.175063047317342, + "tokens_seen": 282984448 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046173520561685054, + "loss": 3.1792, + "theoretical_loss": 4.174948537633968, + "tokens_seen": 283049984 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004617251755265797, + "loss": 3.3762, + "theoretical_loss": 4.174834061882218, + "tokens_seen": 283115520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046171514543630896, + "loss": 3.0439, + "theoretical_loss": 4.1747196200441845, + "tokens_seen": 283181056 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046170511534603814, + "loss": 3.1284, + "theoretical_loss": 4.174605212101977, + "tokens_seen": 283246592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616950852557673, + "loss": 3.328, + "theoretical_loss": 4.174490838037716, + "tokens_seen": 283312128 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616850551654965, + "loss": 3.277, + "theoretical_loss": 4.174376497833537, + "tokens_seen": 283377664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 479763, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.978116750717163, + "objective/train/theoretical_loss": 4.174262191471587, + "objective/train/tokens_used": 303903200, + "theoretical_loss": 4.174262191471587, + "tokens_seen": 283443200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616750250752257, + "loss": 3.2907, + "theoretical_loss": 4.174262191471587, + "tokens_seen": 283443200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616649949849549, + "loss": 3.1322, + "theoretical_loss": 4.17414791893403, + "tokens_seen": 283508736 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046165496489468404, + "loss": 3.1224, + "theoretical_loss": 4.17403368020304, + "tokens_seen": 283574272 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616449348044133, + "loss": 3.3528, + "theoretical_loss": 4.173919475260808, + "tokens_seen": 283639808 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616349047141424, + "loss": 3.3345, + "theoretical_loss": 4.173805304089536, + "tokens_seen": 283705344 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046162487462387164, + "loss": 3.3036, + "theoretical_loss": 4.173691166671439, + "tokens_seen": 283770880 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004616148445336008, + "loss": 3.2598, + "theoretical_loss": 4.173577062988748, + "tokens_seen": 283836416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046160481444333, + "loss": 3.1858, + "theoretical_loss": 4.173462993023706, + "tokens_seen": 283901952 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615947843530592, + "loss": 3.0945, + "theoretical_loss": 4.173348956758568, + "tokens_seen": 283967488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615847542627884, + "loss": 3.2817, + "theoretical_loss": 4.173234954175605, + "tokens_seen": 284033024 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046157472417251755, + "loss": 3.3186, + "theoretical_loss": 4.173120985257102, + "tokens_seen": 284098560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615646940822468, + "loss": 3.2749, + "theoretical_loss": 4.173007049985352, + "tokens_seen": 284164096 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615546639919759, + "loss": 3.2771, + "theoretical_loss": 4.172893148342667, + "tokens_seen": 284229632 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046154463390170515, + "loss": 3.324, + "theoretical_loss": 4.172779280311372, + "tokens_seen": 284295168 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615346038114343, + "loss": 3.4278, + "theoretical_loss": 4.172665445873801, + "tokens_seen": 284360704 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615245737211635, + "loss": 3.2577, + "theoretical_loss": 4.172551645012307, + "tokens_seen": 284426240 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004615145436308927, + "loss": 3.2303, + "theoretical_loss": 4.1724378777092515, + "tokens_seen": 284491776 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046150451354062187, + "loss": 3.1206, + "theoretical_loss": 4.172324143947012, + "tokens_seen": 284557312 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046149448345035105, + "loss": 3.48, + "theoretical_loss": 4.172210443707979, + "tokens_seen": 284622848 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614844533600803, + "loss": 3.1807, + "theoretical_loss": 4.1720967769745565, + "tokens_seen": 284688384 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614744232698094, + "loss": 3.0846, + "theoretical_loss": 4.171983143729159, + "tokens_seen": 284753920 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046146439317953865, + "loss": 3.2445, + "theoretical_loss": 4.1718695439542195, + "tokens_seen": 284819456 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046145436308926783, + "loss": 3.2818, + "theoretical_loss": 4.17175597763218, + "tokens_seen": 284884992 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461444332998997, + "loss": 3.2433, + "theoretical_loss": 4.171642444745497, + "tokens_seen": 284950528 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614343029087262, + "loss": 3.0347, + "theoretical_loss": 4.1715289452766395, + "tokens_seen": 285016064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 482575, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3007097244262695, + "objective/train/theoretical_loss": 4.1714154792080915, + "objective/train/tokens_used": 305541600, + "theoretical_loss": 4.1714154792080915, + "tokens_seen": 285081600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614242728184554, + "loss": 3.1694, + "theoretical_loss": 4.1714154792080915, + "tokens_seen": 285081600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046141424272818455, + "loss": 3.397, + "theoretical_loss": 4.171302046522349, + "tokens_seen": 285147136 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004614042126379138, + "loss": 3.2574, + "theoretical_loss": 4.171188647201921, + "tokens_seen": 285212672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613941825476429, + "loss": 3.2785, + "theoretical_loss": 4.1710752812293315, + "tokens_seen": 285278208 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046138415245737215, + "loss": 2.8969, + "theoretical_loss": 4.170961948587115, + "tokens_seen": 285343744 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613741223671013, + "loss": 3.3526, + "theoretical_loss": 4.17084864925782, + "tokens_seen": 285409280 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613640922768305, + "loss": 3.2864, + "theoretical_loss": 4.1707353832240095, + "tokens_seen": 285474816 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613540621865597, + "loss": 3.2382, + "theoretical_loss": 4.170622150468258, + "tokens_seen": 285540352 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613440320962889, + "loss": 3.2662, + "theoretical_loss": 4.170508950973154, + "tokens_seen": 285605888 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046133400200601806, + "loss": 3.1702, + "theoretical_loss": 4.1703957847213, + "tokens_seen": 285671424 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046132397191574724, + "loss": 3.0162, + "theoretical_loss": 4.170282651695308, + "tokens_seen": 285736960 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613139418254764, + "loss": 3.0852, + "theoretical_loss": 4.170169551877808, + "tokens_seen": 285802496 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046130391173520566, + "loss": 3.3054, + "theoretical_loss": 4.170056485251439, + "tokens_seen": 285868032 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612938816449348, + "loss": 3.2715, + "theoretical_loss": 4.169943451798856, + "tokens_seen": 285933568 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461283851554664, + "loss": 3.1436, + "theoretical_loss": 4.169830451502724, + "tokens_seen": 285999104 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612738214643932, + "loss": 3.3686, + "theoretical_loss": 4.169717484345725, + "tokens_seen": 286064640 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612637913741224, + "loss": 3.4337, + "theoretical_loss": 4.1696045503105506, + "tokens_seen": 286130176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046125376128385156, + "loss": 3.229, + "theoretical_loss": 4.169491649379905, + "tokens_seen": 286195712 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046124373119358074, + "loss": 3.2674, + "theoretical_loss": 4.169378781536509, + "tokens_seen": 286261248 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612337011033099, + "loss": 3.1041, + "theoretical_loss": 4.169265946763095, + "tokens_seen": 286326784 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046122367101303916, + "loss": 3.232, + "theoretical_loss": 4.169153145042405, + "tokens_seen": 286392320 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612136409227683, + "loss": 3.2204, + "theoretical_loss": 4.169040376357199, + "tokens_seen": 286457856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612036108324975, + "loss": 3.3238, + "theoretical_loss": 4.168927640690246, + "tokens_seen": 286523392 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046119358074222665, + "loss": 3.4475, + "theoretical_loss": 4.16881493802433, + "tokens_seen": 286588928 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611835506519559, + "loss": 3.2433, + "theoretical_loss": 4.168702268342248, + "tokens_seen": 286654464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 483835, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.588104486465454, + "objective/train/theoretical_loss": 4.168589631626808, + "objective/train/tokens_used": 307180000, + "theoretical_loss": 4.168589631626808, + "tokens_seen": 286720000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046117352056168506, + "loss": 3.4583, + "theoretical_loss": 4.168589631626808, + "tokens_seen": 286720000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046116349047141425, + "loss": 3.3342, + "theoretical_loss": 4.168477027860833, + "tokens_seen": 286785536 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611534603811434, + "loss": 3.2293, + "theoretical_loss": 4.168364457027158, + "tokens_seen": 286851072 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611434302908726, + "loss": 3.324, + "theoretical_loss": 4.168251919108632, + "tokens_seen": 286916608 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611334002006018, + "loss": 3.2193, + "theoretical_loss": 4.168139414088113, + "tokens_seen": 286982144 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461123370110331, + "loss": 3.2605, + "theoretical_loss": 4.168026941948478, + "tokens_seen": 287047680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046111334002006015, + "loss": 3.1302, + "theoretical_loss": 4.167914502672611, + "tokens_seen": 287113216 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004611033099297894, + "loss": 3.2721, + "theoretical_loss": 4.1678020962434115, + "tokens_seen": 287178752 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046109327983951857, + "loss": 3.2, + "theoretical_loss": 4.167689722643792, + "tokens_seen": 287244288 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046108324974924775, + "loss": 3.242, + "theoretical_loss": 4.1675773818566775, + "tokens_seen": 287309824 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046107321965897693, + "loss": 3.1233, + "theoretical_loss": 4.167465073865006, + "tokens_seen": 287375360 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610631895687061, + "loss": 3.2729, + "theoretical_loss": 4.167352798651726, + "tokens_seen": 287440896 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610531594784353, + "loss": 3.1493, + "theoretical_loss": 4.167240556199802, + "tokens_seen": 287506432 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610431293881645, + "loss": 3.2647, + "theoretical_loss": 4.167128346492211, + "tokens_seen": 287571968 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046103309929789365, + "loss": 3.0721, + "theoretical_loss": 4.16701616951194, + "tokens_seen": 287637504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004610230692076229, + "loss": 3.234, + "theoretical_loss": 4.1669040252419896, + "tokens_seen": 287703040 + }, + { + "epoch": 0.09, + "learning_rate": 0.000461013039117352, + "loss": 3.0508, + "theoretical_loss": 4.166791913665375, + "tokens_seen": 287768576 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046100300902708125, + "loss": 3.4299, + "theoretical_loss": 4.166679834765123, + "tokens_seen": 287834112 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046099297893681043, + "loss": 3.2835, + "theoretical_loss": 4.166567788524272, + "tokens_seen": 287899648 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609829488465396, + "loss": 3.1474, + "theoretical_loss": 4.166455774925875, + "tokens_seen": 287965184 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609729187562688, + "loss": 3.1485, + "theoretical_loss": 4.166343793952995, + "tokens_seen": 288030720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046096288866599803, + "loss": 3.3372, + "theoretical_loss": 4.166231845588712, + "tokens_seen": 288096256 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609528585757272, + "loss": 3.1584, + "theoretical_loss": 4.166119929816113, + "tokens_seen": 288161792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609428284854564, + "loss": 3.1632, + "theoretical_loss": 4.166008046618303, + "tokens_seen": 288227328 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609327983951856, + "loss": 3.1986, + "theoretical_loss": 4.1658961959783944, + "tokens_seen": 288292864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 486829, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.20257830619812, + "objective/train/theoretical_loss": 4.165784377879517, + "objective/train/tokens_used": 308818400, + "theoretical_loss": 4.165784377879517, + "tokens_seen": 288358400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046092276830491475, + "loss": 3.1765, + "theoretical_loss": 4.165784377879517, + "tokens_seen": 288358400 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460912738214644, + "loss": 3.1548, + "theoretical_loss": 4.165672592304811, + "tokens_seen": 288423936 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609027081243731, + "loss": 3.1644, + "theoretical_loss": 4.165560839237429, + "tokens_seen": 288489472 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046089267803410235, + "loss": 3.2015, + "theoretical_loss": 4.165449118660536, + "tokens_seen": 288555008 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608826479438315, + "loss": 3.3718, + "theoretical_loss": 4.16533743055731, + "tokens_seen": 288620544 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608726178535607, + "loss": 3.1688, + "theoretical_loss": 4.165225774910941, + "tokens_seen": 288686080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608625877632899, + "loss": 3.33, + "theoretical_loss": 4.165114151704634, + "tokens_seen": 288751616 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608525576730191, + "loss": 3.2742, + "theoretical_loss": 4.165002560921601, + "tokens_seen": 288817152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046084252758274826, + "loss": 3.3068, + "theoretical_loss": 4.164891002545073, + "tokens_seen": 288882688 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046083249749247744, + "loss": 3.3345, + "theoretical_loss": 4.16477947655829, + "tokens_seen": 288948224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608224674022066, + "loss": 3.1534, + "theoretical_loss": 4.164667982944504, + "tokens_seen": 289013760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046081243731193586, + "loss": 3.2002, + "theoretical_loss": 4.164556521686981, + "tokens_seen": 289079296 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460802407221665, + "loss": 3.2804, + "theoretical_loss": 4.1644450927689975, + "tokens_seen": 289144832 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607923771313942, + "loss": 3.2074, + "theoretical_loss": 4.164333696173846, + "tokens_seen": 289210368 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607823470411234, + "loss": 3.1862, + "theoretical_loss": 4.164222331884827, + "tokens_seen": 289275904 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607723169508526, + "loss": 3.3116, + "theoretical_loss": 4.164110999885256, + "tokens_seen": 289341440 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046076228686058176, + "loss": 3.4174, + "theoretical_loss": 4.163999700158462, + "tokens_seen": 289406976 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046075225677031094, + "loss": 3.2862, + "theoretical_loss": 4.163888432687784, + "tokens_seen": 289472512 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607422266800401, + "loss": 3.5042, + "theoretical_loss": 4.163777197456573, + "tokens_seen": 289538048 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046073219658976936, + "loss": 3.3706, + "theoretical_loss": 4.163665994448197, + "tokens_seen": 289603584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607221664994985, + "loss": 3.1515, + "theoretical_loss": 4.163554823646027, + "tokens_seen": 289669120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607121364092277, + "loss": 3.249, + "theoretical_loss": 4.163443685033458, + "tokens_seen": 289734656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046070210631895685, + "loss": 3.1277, + "theoretical_loss": 4.163332578593889, + "tokens_seen": 289800192 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606920762286861, + "loss": 3.4015, + "theoretical_loss": 4.163221504310734, + "tokens_seen": 289865728 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046068204613841526, + "loss": 3.1692, + "theoretical_loss": 4.1631104621674195, + "tokens_seen": 289931264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 489827, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2091422080993652, + "objective/train/theoretical_loss": 4.162999452147384, + "objective/train/tokens_used": 310456800, + "theoretical_loss": 4.162999452147384, + "tokens_seen": 289996800 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046067201604814445, + "loss": 3.2297, + "theoretical_loss": 4.162999452147384, + "tokens_seen": 289996800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606619859578736, + "loss": 3.3246, + "theoretical_loss": 4.1628884742340775, + "tokens_seen": 290062336 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606519558676028, + "loss": 3.2653, + "theoretical_loss": 4.162777528410963, + "tokens_seen": 290127872 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460641925777332, + "loss": 3.3269, + "theoretical_loss": 4.162666614661518, + "tokens_seen": 290193408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606318956870612, + "loss": 3.3253, + "theoretical_loss": 4.162555732969227, + "tokens_seen": 290258944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046062186559679035, + "loss": 2.9985, + "theoretical_loss": 4.162444883317591, + "tokens_seen": 290324480 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004606118355065196, + "loss": 3.2446, + "theoretical_loss": 4.162334065690123, + "tokens_seen": 290390016 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046060180541624877, + "loss": 3.3515, + "theoretical_loss": 4.162223280070345, + "tokens_seen": 290455552 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046059177532597795, + "loss": 3.3583, + "theoretical_loss": 4.1621125264417955, + "tokens_seen": 290521088 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046058174523570713, + "loss": 3.2522, + "theoretical_loss": 4.162001804788021, + "tokens_seen": 290586624 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605717151454363, + "loss": 3.1744, + "theoretical_loss": 4.161891115092583, + "tokens_seen": 290652160 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605616850551655, + "loss": 3.2102, + "theoretical_loss": 4.161780457339055, + "tokens_seen": 290717696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605516549648947, + "loss": 3.1037, + "theoretical_loss": 4.161669831511022, + "tokens_seen": 290783232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046054162487462385, + "loss": 3.3432, + "theoretical_loss": 4.16155923759208, + "tokens_seen": 290848768 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605315947843531, + "loss": 3.2196, + "theoretical_loss": 4.161448675565838, + "tokens_seen": 290914304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605215646940822, + "loss": 3.3455, + "theoretical_loss": 4.161338145415918, + "tokens_seen": 290979840 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046051153460381145, + "loss": 3.1646, + "theoretical_loss": 4.161227647125955, + "tokens_seen": 291045376 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046050150451354063, + "loss": 3.2854, + "theoretical_loss": 4.161117180679591, + "tokens_seen": 291110912 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604914744232698, + "loss": 3.3936, + "theoretical_loss": 4.161006746060488, + "tokens_seen": 291176448 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460481444332999, + "loss": 3.3065, + "theoretical_loss": 4.160896343252311, + "tokens_seen": 291241984 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046047141424272823, + "loss": 3.2233, + "theoretical_loss": 4.160785972238745, + "tokens_seen": 291307520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046046138415245736, + "loss": 3.2386, + "theoretical_loss": 4.160675633003484, + "tokens_seen": 291373056 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604513540621866, + "loss": 3.1544, + "theoretical_loss": 4.16056532553023, + "tokens_seen": 291438592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604413239719157, + "loss": 3.3637, + "theoretical_loss": 4.160455049802706, + "tokens_seen": 291504128 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046043129388164495, + "loss": 3.2991, + "theoretical_loss": 4.1603448058046375, + "tokens_seen": 291569664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 492733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4846599102020264, + "objective/train/theoretical_loss": 4.160234593519768, + "objective/train/tokens_used": 312095200, + "theoretical_loss": 4.160234593519768, + "tokens_seen": 291635200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046042126379137414, + "loss": 3.3415, + "theoretical_loss": 4.160234593519768, + "tokens_seen": 291635200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604112337011033, + "loss": 3.2213, + "theoretical_loss": 4.160124412931852, + "tokens_seen": 291700736 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004604012036108325, + "loss": 3.2588, + "theoretical_loss": 4.160014264024654, + "tokens_seen": 291766272 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603911735205617, + "loss": 3.0416, + "theoretical_loss": 4.159904146781952, + "tokens_seen": 291831808 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046038114343029086, + "loss": 3.1963, + "theoretical_loss": 4.159794061187536, + "tokens_seen": 291897344 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603711133400201, + "loss": 3.2623, + "theoretical_loss": 4.1596840072252075, + "tokens_seen": 291962880 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603610832497492, + "loss": 3.2708, + "theoretical_loss": 4.159573984878779, + "tokens_seen": 292028416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046035105315947846, + "loss": 3.0802, + "theoretical_loss": 4.159463994132079, + "tokens_seen": 292093952 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603410230692076, + "loss": 3.3017, + "theoretical_loss": 4.15935403496894, + "tokens_seen": 292159488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603309929789368, + "loss": 3.1827, + "theoretical_loss": 4.159244107373215, + "tokens_seen": 292225024 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460320962888666, + "loss": 3.359, + "theoretical_loss": 4.159134211328765, + "tokens_seen": 292290560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004603109327983952, + "loss": 3.4973, + "theoretical_loss": 4.159024346819461, + "tokens_seen": 292356096 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046030090270812436, + "loss": 3.1381, + "theoretical_loss": 4.158914513829189, + "tokens_seen": 292421632 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602908726178536, + "loss": 3.3086, + "theoretical_loss": 4.158804712341845, + "tokens_seen": 292487168 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602808425275827, + "loss": 3.1155, + "theoretical_loss": 4.158694942341338, + "tokens_seen": 292552704 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046027081243731196, + "loss": 3.3241, + "theoretical_loss": 4.1585852038115885, + "tokens_seen": 292618240 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602607823470411, + "loss": 3.2082, + "theoretical_loss": 4.1584754967365285, + "tokens_seen": 292683776 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602507522567703, + "loss": 3.1541, + "theoretical_loss": 4.1583658211001016, + "tokens_seen": 292749312 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602407221664995, + "loss": 3.1796, + "theoretical_loss": 4.158256176886264, + "tokens_seen": 292814848 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602306920762287, + "loss": 3.3778, + "theoretical_loss": 4.158146564078982, + "tokens_seen": 292880384 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046022066198595787, + "loss": 3.2331, + "theoretical_loss": 4.158036982662237, + "tokens_seen": 292945920 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046021063189568705, + "loss": 3.1747, + "theoretical_loss": 4.157927432620018, + "tokens_seen": 293011456 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004602006018054163, + "loss": 3.3576, + "theoretical_loss": 4.157817913936329, + "tokens_seen": 293076992 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046019057171514546, + "loss": 3.2126, + "theoretical_loss": 4.157708426595184, + "tokens_seen": 293142528 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046018054162487465, + "loss": 3.1936, + "theoretical_loss": 4.157598970580608, + "tokens_seen": 293208064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 495626, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1775639057159424, + "objective/train/theoretical_loss": 4.157489545876642, + "objective/train/tokens_used": 313733600, + "theoretical_loss": 4.157489545876642, + "tokens_seen": 293273600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601705115346038, + "loss": 3.24, + "theoretical_loss": 4.157489545876642, + "tokens_seen": 293273600 + }, + { + "epoch": 0.09, + "learning_rate": 0.000460160481444333, + "loss": 3.2181, + "theoretical_loss": 4.157380152467333, + "tokens_seen": 293339136 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601504513540622, + "loss": 3.317, + "theoretical_loss": 4.157270790336742, + "tokens_seen": 293404672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601404212637914, + "loss": 3.3087, + "theoretical_loss": 4.157161459468944, + "tokens_seen": 293470208 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046013039117352055, + "loss": 3.2493, + "theoretical_loss": 4.157052159848023, + "tokens_seen": 293535744 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004601203610832498, + "loss": 3.2079, + "theoretical_loss": 4.156942891458074, + "tokens_seen": 293601280 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046011033099297897, + "loss": 3.2274, + "theoretical_loss": 4.156833654283207, + "tokens_seen": 293666816 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046010030090270815, + "loss": 3.324, + "theoretical_loss": 4.15672444830754, + "tokens_seen": 293732352 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046009027081243733, + "loss": 3.1689, + "theoretical_loss": 4.156615273515205, + "tokens_seen": 293797888 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600802407221665, + "loss": 3.0752, + "theoretical_loss": 4.156506129890344, + "tokens_seen": 293863424 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600702106318957, + "loss": 3.2226, + "theoretical_loss": 4.156397017417111, + "tokens_seen": 293928960 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046006018054162493, + "loss": 3.3122, + "theoretical_loss": 4.156287936079675, + "tokens_seen": 293994496 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046005015045135405, + "loss": 3.2343, + "theoretical_loss": 4.156178885862209, + "tokens_seen": 294060032 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600401203610833, + "loss": 3.2061, + "theoretical_loss": 4.156069866748906, + "tokens_seen": 294125568 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004600300902708124, + "loss": 3.2928, + "theoretical_loss": 4.155960878723965, + "tokens_seen": 294191104 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046002006018054165, + "loss": 3.2261, + "theoretical_loss": 4.155851921771598, + "tokens_seen": 294256640 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046001003009027083, + "loss": 3.2787, + "theoretical_loss": 4.155742995876029, + "tokens_seen": 294322176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046, + "loss": 3.3526, + "theoretical_loss": 4.155634101021494, + "tokens_seen": 294387712 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599899699097292, + "loss": 3.2081, + "theoretical_loss": 4.155525237192238, + "tokens_seen": 294453248 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045997993981945843, + "loss": 3.141, + "theoretical_loss": 4.155416404372522, + "tokens_seen": 294518784 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045996990972918756, + "loss": 3.1415, + "theoretical_loss": 4.155307602546614, + "tokens_seen": 294584320 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599598796389168, + "loss": 3.3549, + "theoretical_loss": 4.155198831698795, + "tokens_seen": 294649856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599498495486459, + "loss": 3.2939, + "theoretical_loss": 4.155090091813358, + "tokens_seen": 294715392 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045993981945837515, + "loss": 3.181, + "theoretical_loss": 4.154981382874608, + "tokens_seen": 294780928 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045992978936810434, + "loss": 3.2284, + "theoretical_loss": 4.154872704866859, + "tokens_seen": 294846464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 497932, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.446946620941162, + "objective/train/theoretical_loss": 4.15476405777444, + "objective/train/tokens_used": 315372000, + "theoretical_loss": 4.15476405777444, + "tokens_seen": 294912000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599197592778335, + "loss": 3.1411, + "theoretical_loss": 4.15476405777444, + "tokens_seen": 294912000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004599097291875627, + "loss": 3.2945, + "theoretical_loss": 4.154655441581687, + "tokens_seen": 294977536 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598996990972919, + "loss": 3.2329, + "theoretical_loss": 4.154546856272952, + "tokens_seen": 295043072 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045988966900702106, + "loss": 3.3265, + "theoretical_loss": 4.154438301832596, + "tokens_seen": 295108608 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598796389167503, + "loss": 3.0045, + "theoretical_loss": 4.154329778244991, + "tokens_seen": 295174144 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598696088264794, + "loss": 3.2591, + "theoretical_loss": 4.154221285494521, + "tokens_seen": 295239680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045985957873620866, + "loss": 3.2347, + "theoretical_loss": 4.154112823565582, + "tokens_seen": 295305216 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598495486459378, + "loss": 3.0708, + "theoretical_loss": 4.15400439244258, + "tokens_seen": 295370752 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459839518555667, + "loss": 3.3149, + "theoretical_loss": 4.153895992109935, + "tokens_seen": 295436288 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598294884653962, + "loss": 3.2293, + "theoretical_loss": 4.153787622552073, + "tokens_seen": 295501824 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004598194583751254, + "loss": 3.2113, + "theoretical_loss": 4.153679283753439, + "tokens_seen": 295567360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045980942828485456, + "loss": 3.1827, + "theoretical_loss": 4.15357097569848, + "tokens_seen": 295632896 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597993981945838, + "loss": 3.3137, + "theoretical_loss": 4.153462698371665, + "tokens_seen": 295698432 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597893681043129, + "loss": 3.3318, + "theoretical_loss": 4.1533544517574645, + "tokens_seen": 295763968 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045977933801404216, + "loss": 3.3773, + "theoretical_loss": 4.153246235840367, + "tokens_seen": 295829504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597693079237713, + "loss": 3.1728, + "theoretical_loss": 4.153138050604868, + "tokens_seen": 295895040 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597592778335005, + "loss": 3.2079, + "theoretical_loss": 4.153029896035476, + "tokens_seen": 295960576 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597492477432297, + "loss": 3.0965, + "theoretical_loss": 4.152921772116712, + "tokens_seen": 296026112 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004597392176529589, + "loss": 3.1324, + "theoretical_loss": 4.152813678833106, + "tokens_seen": 296091648 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045972918756268807, + "loss": 3.2729, + "theoretical_loss": 4.152705616169202, + "tokens_seen": 296157184 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045971915747241725, + "loss": 3.2885, + "theoretical_loss": 4.15259758410955, + "tokens_seen": 296222720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045970912738214643, + "loss": 3.3698, + "theoretical_loss": 4.152489582638719, + "tokens_seen": 296288256 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045969909729187566, + "loss": 3.206, + "theoretical_loss": 4.152381611741281, + "tokens_seen": 296353792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596890672016048, + "loss": 3.3003, + "theoretical_loss": 4.152273671401824, + "tokens_seen": 296419328 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459679037111334, + "loss": 3.3153, + "theoretical_loss": 4.152165761604948, + "tokens_seen": 296484864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 500857, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4009289741516113, + "objective/train/theoretical_loss": 4.152057882335261, + "objective/train/tokens_used": 317010400, + "theoretical_loss": 4.152057882335261, + "tokens_seen": 296550400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045966900702106315, + "loss": 3.2812, + "theoretical_loss": 4.152057882335261, + "tokens_seen": 296550400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596589769307924, + "loss": 3.3087, + "theoretical_loss": 4.151950033577383, + "tokens_seen": 296615936 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045964894684052157, + "loss": 3.3613, + "theoretical_loss": 4.151842215315947, + "tokens_seen": 296681472 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045963891675025075, + "loss": 3.3652, + "theoretical_loss": 4.151734427535594, + "tokens_seen": 296747008 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045962888665997993, + "loss": 3.1332, + "theoretical_loss": 4.151626670220979, + "tokens_seen": 296812544 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045961885656970917, + "loss": 3.1149, + "theoretical_loss": 4.151518943356768, + "tokens_seen": 296878080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596088264794383, + "loss": 3.1465, + "theoretical_loss": 4.151411246927636, + "tokens_seen": 296943616 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045959879638916753, + "loss": 3.3355, + "theoretical_loss": 4.15130358091827, + "tokens_seen": 297009152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045958876629889666, + "loss": 3.1963, + "theoretical_loss": 4.151195945313369, + "tokens_seen": 297074688 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595787362086259, + "loss": 3.3621, + "theoretical_loss": 4.151088340097642, + "tokens_seen": 297140224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595687061183551, + "loss": 3.2064, + "theoretical_loss": 4.15098076525581, + "tokens_seen": 297205760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045955867602808425, + "loss": 3.3119, + "theoretical_loss": 4.150873220772604, + "tokens_seen": 297271296 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045954864593781344, + "loss": 3.2729, + "theoretical_loss": 4.150765706632766, + "tokens_seen": 297336832 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595386158475426, + "loss": 3.2685, + "theoretical_loss": 4.1506582228210505, + "tokens_seen": 297402368 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595285857572718, + "loss": 3.1655, + "theoretical_loss": 4.150550769322221, + "tokens_seen": 297467904 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045951855566700103, + "loss": 3.3324, + "theoretical_loss": 4.150443346121054, + "tokens_seen": 297533440 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045950852557673016, + "loss": 3.445, + "theoretical_loss": 4.150335953202336, + "tokens_seen": 297598976 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594984954864594, + "loss": 3.2452, + "theoretical_loss": 4.150228590550864, + "tokens_seen": 297664512 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594884653961885, + "loss": 3.1384, + "theoretical_loss": 4.150121258151447, + "tokens_seen": 297730048 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045947843530591776, + "loss": 3.2967, + "theoretical_loss": 4.150013955988905, + "tokens_seen": 297795584 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045946840521564694, + "loss": 3.2513, + "theoretical_loss": 4.149906684048068, + "tokens_seen": 297861120 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594583751253761, + "loss": 3.2854, + "theoretical_loss": 4.1497994423137765, + "tokens_seen": 297926656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045944834503510536, + "loss": 3.253, + "theoretical_loss": 4.149692230770884, + "tokens_seen": 297992192 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045943831494483454, + "loss": 3.1959, + "theoretical_loss": 4.149585049404253, + "tokens_seen": 298057728 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594282848545637, + "loss": 3.1186, + "theoretical_loss": 4.149477898198759, + "tokens_seen": 298123264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 503633, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.375124454498291, + "objective/train/theoretical_loss": 4.149370777139286, + "objective/train/tokens_used": 318648800, + "theoretical_loss": 4.149370777139286, + "tokens_seen": 298188800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594182547642929, + "loss": 3.142, + "theoretical_loss": 4.149370777139286, + "tokens_seen": 298188800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594082246740221, + "loss": 3.28, + "theoretical_loss": 4.14926368621073, + "tokens_seen": 298254336 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045939819458375126, + "loss": 3.2761, + "theoretical_loss": 4.149156625397998, + "tokens_seen": 298319872 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593881644934805, + "loss": 3.1711, + "theoretical_loss": 4.149049594686008, + "tokens_seen": 298385408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593781344032096, + "loss": 3.2757, + "theoretical_loss": 4.1489425940596885, + "tokens_seen": 298450944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045936810431293886, + "loss": 3.168, + "theoretical_loss": 4.148835623503978, + "tokens_seen": 298516480 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459358074222668, + "loss": 3.3068, + "theoretical_loss": 4.148728683003829, + "tokens_seen": 298582016 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593480441323972, + "loss": 3.3493, + "theoretical_loss": 4.1486217725442005, + "tokens_seen": 298647552 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593380140421264, + "loss": 3.2287, + "theoretical_loss": 4.148514892110065, + "tokens_seen": 298713088 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593279839518556, + "loss": 3.2606, + "theoretical_loss": 4.148408041686406, + "tokens_seen": 298778624 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045931795386158476, + "loss": 3.2844, + "theoretical_loss": 4.148301221258217, + "tokens_seen": 298844160 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459307923771314, + "loss": 3.2522, + "theoretical_loss": 4.148194430810502, + "tokens_seen": 298909696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592978936810431, + "loss": 3.1122, + "theoretical_loss": 4.148087670328276, + "tokens_seen": 298975232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045928786359077236, + "loss": 3.1748, + "theoretical_loss": 4.147980939796565, + "tokens_seen": 299040768 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592778335005015, + "loss": 3.2804, + "theoretical_loss": 4.147874239200405, + "tokens_seen": 299106304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592678034102307, + "loss": 3.1784, + "theoretical_loss": 4.147767568524845, + "tokens_seen": 299171840 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592577733199599, + "loss": 3.1529, + "theoretical_loss": 4.147660927754942, + "tokens_seen": 299237376 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592477432296891, + "loss": 3.1867, + "theoretical_loss": 4.147554316875766, + "tokens_seen": 299302912 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045923771313941827, + "loss": 3.2671, + "theoretical_loss": 4.147447735872396, + "tokens_seen": 299368448 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045922768304914745, + "loss": 3.2184, + "theoretical_loss": 4.147341184729921, + "tokens_seen": 299433984 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045921765295887663, + "loss": 3.0667, + "theoretical_loss": 4.147234663433444, + "tokens_seen": 299499520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045920762286860586, + "loss": 3.273, + "theoretical_loss": 4.147128171968077, + "tokens_seen": 299565056 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459197592778335, + "loss": 3.3826, + "theoretical_loss": 4.14702171031894, + "tokens_seen": 299630592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004591875626880642, + "loss": 3.1591, + "theoretical_loss": 4.146915278471169, + "tokens_seen": 299696128 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045917753259779335, + "loss": 3.2757, + "theoretical_loss": 4.146808876409906, + "tokens_seen": 299761664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 504999, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.281003713607788, + "objective/train/theoretical_loss": 4.146702504120305, + "objective/train/tokens_used": 320287200, + "theoretical_loss": 4.146702504120305, + "tokens_seen": 299827200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004591675025075226, + "loss": 3.2764, + "theoretical_loss": 4.146702504120305, + "tokens_seen": 299827200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045915747241725177, + "loss": 3.3185, + "theoretical_loss": 4.146596161587532, + "tokens_seen": 299892736 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045914744232698095, + "loss": 3.2489, + "theoretical_loss": 4.146489848796763, + "tokens_seen": 299958272 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045913741223671013, + "loss": 3.2721, + "theoretical_loss": 4.146383565733184, + "tokens_seen": 300023808 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045912738214643937, + "loss": 3.3069, + "theoretical_loss": 4.146277312381991, + "tokens_seen": 300089344 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004591173520561685, + "loss": 3.2378, + "theoretical_loss": 4.1461710887283925, + "tokens_seen": 300154880 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045910732196589773, + "loss": 3.2074, + "theoretical_loss": 4.146064894757606, + "tokens_seen": 300220416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045909729187562686, + "loss": 3.215, + "theoretical_loss": 4.145958730454861, + "tokens_seen": 300285952 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590872617853561, + "loss": 3.2898, + "theoretical_loss": 4.145852595805396, + "tokens_seen": 300351488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590772316950853, + "loss": 3.023, + "theoretical_loss": 4.145746490794461, + "tokens_seen": 300417024 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045906720160481445, + "loss": 3.4011, + "theoretical_loss": 4.145640415407317, + "tokens_seen": 300482560 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045905717151454364, + "loss": 3.2439, + "theoretical_loss": 4.145534369629234, + "tokens_seen": 300548096 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590471414242728, + "loss": 3.343, + "theoretical_loss": 4.145428353445494, + "tokens_seen": 300613632 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459037111334002, + "loss": 3.1354, + "theoretical_loss": 4.145322366841389, + "tokens_seen": 300679168 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045902708124373123, + "loss": 3.3032, + "theoretical_loss": 4.145216409802221, + "tokens_seen": 300744704 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045901705115346036, + "loss": 3.1079, + "theoretical_loss": 4.145110482313304, + "tokens_seen": 300810240 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590070210631896, + "loss": 3.2651, + "theoretical_loss": 4.1450045843599606, + "tokens_seen": 300875776 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589969909729187, + "loss": 3.208, + "theoretical_loss": 4.144898715927525, + "tokens_seen": 300941312 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045898696088264796, + "loss": 3.2061, + "theoretical_loss": 4.144792877001342, + "tokens_seen": 301006848 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045897693079237714, + "loss": 3.3135, + "theoretical_loss": 4.144687067566765, + "tokens_seen": 301072384 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589669007021063, + "loss": 3.1593, + "theoretical_loss": 4.144581287609161, + "tokens_seen": 301137920 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589568706118355, + "loss": 3.2523, + "theoretical_loss": 4.144475537113905, + "tokens_seen": 301203456 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045894684052156474, + "loss": 3.1053, + "theoretical_loss": 4.144369816066385, + "tokens_seen": 301268992 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045893681043129386, + "loss": 3.3625, + "theoretical_loss": 4.144264124451995, + "tokens_seen": 301334528 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589267803410231, + "loss": 3.3155, + "theoretical_loss": 4.1441584622561445, + "tokens_seen": 301400064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 507643, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3536794185638428, + "objective/train/theoretical_loss": 4.144052829464249, + "objective/train/tokens_used": 321925600, + "theoretical_loss": 4.144052829464249, + "tokens_seen": 301465600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004589167502507522, + "loss": 3.278, + "theoretical_loss": 4.144052829464249, + "tokens_seen": 301465600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045890672016048146, + "loss": 3.1807, + "theoretical_loss": 4.143947226061737, + "tokens_seen": 301531136 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045889669007021064, + "loss": 3.2503, + "theoretical_loss": 4.143841652034048, + "tokens_seen": 301596672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004588866599799398, + "loss": 3.2595, + "theoretical_loss": 4.143736107366629, + "tokens_seen": 301662208 + }, + { + "epoch": 0.09, + "learning_rate": 0.000458876629889669, + "loss": 3.3553, + "theoretical_loss": 4.14363059204494, + "tokens_seen": 301727744 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004588665997993982, + "loss": 3.2875, + "theoretical_loss": 4.14352510605445, + "tokens_seen": 301793280 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045885656970912737, + "loss": 3.4194, + "theoretical_loss": 4.143419649380639, + "tokens_seen": 301858816 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004588465396188566, + "loss": 3.2576, + "theoretical_loss": 4.143314222008997, + "tokens_seen": 301924352 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045883650952858573, + "loss": 3.1445, + "theoretical_loss": 4.143208823925024, + "tokens_seen": 301989888 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045882647943831496, + "loss": 3.1784, + "theoretical_loss": 4.143103455114231, + "tokens_seen": 302055424 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045881644934804415, + "loss": 3.2786, + "theoretical_loss": 4.142998115562139, + "tokens_seen": 302120960 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004588064192577733, + "loss": 3.1728, + "theoretical_loss": 4.14289280525428, + "tokens_seen": 302186496 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587963891675025, + "loss": 3.2115, + "theoretical_loss": 4.142787524176194, + "tokens_seen": 302252032 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587863590772317, + "loss": 3.3602, + "theoretical_loss": 4.142682272313435, + "tokens_seen": 302317568 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045877632898696087, + "loss": 3.1711, + "theoretical_loss": 4.142577049651563, + "tokens_seen": 302383104 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587662988966901, + "loss": 3.0686, + "theoretical_loss": 4.142471856176152, + "tokens_seen": 302448640 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045875626880641923, + "loss": 2.9837, + "theoretical_loss": 4.142366691872784, + "tokens_seen": 302514176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045874623871614847, + "loss": 3.4917, + "theoretical_loss": 4.142261556727052, + "tokens_seen": 302579712 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587362086258776, + "loss": 3.1597, + "theoretical_loss": 4.14215645072456, + "tokens_seen": 302645248 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045872617853560683, + "loss": 3.1219, + "theoretical_loss": 4.14205137385092, + "tokens_seen": 302710784 + }, + { + "epoch": 0.09, + "learning_rate": 0.000458716148445336, + "loss": 3.2477, + "theoretical_loss": 4.141946326091756, + "tokens_seen": 302776320 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004587061183550652, + "loss": 3.2398, + "theoretical_loss": 4.141841307432703, + "tokens_seen": 302841856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004586960882647944, + "loss": 3.3017, + "theoretical_loss": 4.1417363178594035, + "tokens_seen": 302907392 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045868605817452355, + "loss": 3.2981, + "theoretical_loss": 4.141631357357513, + "tokens_seen": 302972928 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004586760280842528, + "loss": 3.1898, + "theoretical_loss": 4.141526425912694, + "tokens_seen": 303038464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 510192, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0469167232513428, + "objective/train/theoretical_loss": 4.141421523510623, + "objective/train/tokens_used": 323564000, + "theoretical_loss": 4.141421523510623, + "tokens_seen": 303104000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045866599799398197, + "loss": 3.2135, + "theoretical_loss": 4.141421523510623, + "tokens_seen": 303104000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045865596790371115, + "loss": 3.1985, + "theoretical_loss": 4.141316650136983, + "tokens_seen": 303169536 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045864593781344033, + "loss": 3.2456, + "theoretical_loss": 4.14121180577747, + "tokens_seen": 303235072 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045863590772316957, + "loss": 3.2916, + "theoretical_loss": 4.141106990417789, + "tokens_seen": 303300608 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004586258776328987, + "loss": 3.2332, + "theoretical_loss": 4.141002204043654, + "tokens_seen": 303366144 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045861584754262793, + "loss": 3.2224, + "theoretical_loss": 4.140897446640793, + "tokens_seen": 303431680 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045860581745235706, + "loss": 3.3678, + "theoretical_loss": 4.1407927181949375, + "tokens_seen": 303497216 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585957873620863, + "loss": 3.2124, + "theoretical_loss": 4.140688018691835, + "tokens_seen": 303562752 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585857572718155, + "loss": 3.1656, + "theoretical_loss": 4.140583348117241, + "tokens_seen": 303628288 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045857572718154465, + "loss": 3.3435, + "theoretical_loss": 4.140478706456921, + "tokens_seen": 303693824 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045856569709127384, + "loss": 3.1499, + "theoretical_loss": 4.140374093696651, + "tokens_seen": 303759360 + }, + { + "epoch": 0.09, + "learning_rate": 0.000458555667001003, + "loss": 3.2024, + "theoretical_loss": 4.1402695098222155, + "tokens_seen": 303824896 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585456369107322, + "loss": 3.2863, + "theoretical_loss": 4.140164954819412, + "tokens_seen": 303890432 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045853560682046143, + "loss": 3.1066, + "theoretical_loss": 4.140060428674046, + "tokens_seen": 303955968 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045852557673019056, + "loss": 3.2562, + "theoretical_loss": 4.139955931371932, + "tokens_seen": 304021504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585155466399198, + "loss": 3.2024, + "theoretical_loss": 4.139851462898897, + "tokens_seen": 304087040 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585055165496489, + "loss": 3.1125, + "theoretical_loss": 4.139747023240777, + "tokens_seen": 304152576 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045849548645937816, + "loss": 3.2009, + "theoretical_loss": 4.139642612383418, + "tokens_seen": 304218112 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045848545636910734, + "loss": 3.1398, + "theoretical_loss": 4.1395382303126755, + "tokens_seen": 304283648 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004584754262788365, + "loss": 3.2684, + "theoretical_loss": 4.139433877014415, + "tokens_seen": 304349184 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004584653961885657, + "loss": 3.2985, + "theoretical_loss": 4.139329552474514, + "tokens_seen": 304414720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045845536609829494, + "loss": 3.3404, + "theoretical_loss": 4.139225256678857, + "tokens_seen": 304480256 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045844533600802406, + "loss": 3.1915, + "theoretical_loss": 4.139120989613341, + "tokens_seen": 304545792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004584353059177533, + "loss": 3.1363, + "theoretical_loss": 4.1390167512638705, + "tokens_seen": 304611328 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004584252758274824, + "loss": 3.0954, + "theoretical_loss": 4.138912541616363, + "tokens_seen": 304676864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 513080, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3229451179504395, + "objective/train/theoretical_loss": 4.138808360656742, + "objective/train/tokens_used": 325202400, + "theoretical_loss": 4.138808360656742, + "tokens_seen": 304742400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045841524573721166, + "loss": 3.2209, + "theoretical_loss": 4.138808360656742, + "tokens_seen": 304742400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045840521564694084, + "loss": 3.3321, + "theoretical_loss": 4.138704208370944, + "tokens_seen": 304807936 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045839518555667, + "loss": 3.2887, + "theoretical_loss": 4.138600084744915, + "tokens_seen": 304873472 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583851554663992, + "loss": 3.1826, + "theoretical_loss": 4.1384959897646105, + "tokens_seen": 304939008 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583751253761284, + "loss": 3.1935, + "theoretical_loss": 4.138391923415996, + "tokens_seen": 305004544 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045836509528585757, + "loss": 3.1808, + "theoretical_loss": 4.138287885685045, + "tokens_seen": 305070080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583550651955868, + "loss": 3.2941, + "theoretical_loss": 4.138183876557745, + "tokens_seen": 305135616 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045834503510531593, + "loss": 3.2452, + "theoretical_loss": 4.1380798960200895, + "tokens_seen": 305201152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045833500501504516, + "loss": 3.341, + "theoretical_loss": 4.137975944058083, + "tokens_seen": 305266688 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045832497492477435, + "loss": 3.0318, + "theoretical_loss": 4.137872020657742, + "tokens_seen": 305332224 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583149448345035, + "loss": 3.0752, + "theoretical_loss": 4.1377681258050885, + "tokens_seen": 305397760 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004583049147442327, + "loss": 3.0008, + "theoretical_loss": 4.13766425948616, + "tokens_seen": 305463296 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582948846539619, + "loss": 3.2498, + "theoretical_loss": 4.137560421686998, + "tokens_seen": 305528832 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045828485456369107, + "loss": 3.0782, + "theoretical_loss": 4.137456612393658, + "tokens_seen": 305594368 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582748244734203, + "loss": 2.9829, + "theoretical_loss": 4.137352831592203, + "tokens_seen": 305659904 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045826479438314943, + "loss": 3.3061, + "theoretical_loss": 4.137249079268707, + "tokens_seen": 305725440 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045825476429287867, + "loss": 3.1917, + "theoretical_loss": 4.137145355409253, + "tokens_seen": 305790976 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582447342026078, + "loss": 3.0803, + "theoretical_loss": 4.137041659999936, + "tokens_seen": 305856512 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045823470411233703, + "loss": 3.2626, + "theoretical_loss": 4.136937993026857, + "tokens_seen": 305922048 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582246740220662, + "loss": 3.0258, + "theoretical_loss": 4.136834354476129, + "tokens_seen": 305987584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004582146439317954, + "loss": 3.1611, + "theoretical_loss": 4.1367307443338746, + "tokens_seen": 306053120 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045820461384152457, + "loss": 3.2962, + "theoretical_loss": 4.136627162586226, + "tokens_seen": 306118656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045819458375125375, + "loss": 3.2582, + "theoretical_loss": 4.136523609219327, + "tokens_seen": 306184192 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045818455366098294, + "loss": 3.1617, + "theoretical_loss": 4.136420084219327, + "tokens_seen": 306249728 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045817452357071217, + "loss": 3.2624, + "theoretical_loss": 4.136316587572388, + "tokens_seen": 306315264 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 514436, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3893537521362305, + "objective/train/theoretical_loss": 4.136213119264681, + "objective/train/tokens_used": 326840800, + "theoretical_loss": 4.136213119264681, + "tokens_seen": 306380800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581644934804413, + "loss": 3.2763, + "theoretical_loss": 4.136213119264681, + "tokens_seen": 306380800 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045815446339017053, + "loss": 3.3547, + "theoretical_loss": 4.136109679282388, + "tokens_seen": 306446336 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581444332998997, + "loss": 3.1888, + "theoretical_loss": 4.136006267611697, + "tokens_seen": 306511872 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581344032096289, + "loss": 3.1728, + "theoretical_loss": 4.135902884238812, + "tokens_seen": 306577408 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581243731193581, + "loss": 3.2606, + "theoretical_loss": 4.135799529149939, + "tokens_seen": 306642944 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045811434302908726, + "loss": 3.369, + "theoretical_loss": 4.1356962023313, + "tokens_seen": 306708480 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045810431293881644, + "loss": 3.0983, + "theoretical_loss": 4.135592903769124, + "tokens_seen": 306774016 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580942828485457, + "loss": 3.1481, + "theoretical_loss": 4.135489633449649, + "tokens_seen": 306839552 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580842527582748, + "loss": 3.1581, + "theoretical_loss": 4.135386391359123, + "tokens_seen": 306905088 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045807422266800404, + "loss": 3.1852, + "theoretical_loss": 4.135283177483807, + "tokens_seen": 306970624 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045806419257773316, + "loss": 3.2938, + "theoretical_loss": 4.135179991809965, + "tokens_seen": 307036160 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580541624874624, + "loss": 3.3464, + "theoretical_loss": 4.135076834323876, + "tokens_seen": 307101696 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580441323971916, + "loss": 3.3216, + "theoretical_loss": 4.134973705011828, + "tokens_seen": 307167232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045803410230692076, + "loss": 3.1211, + "theoretical_loss": 4.134870603860117, + "tokens_seen": 307232768 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045802407221664994, + "loss": 3.3432, + "theoretical_loss": 4.134767530855047, + "tokens_seen": 307298304 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580140421263791, + "loss": 3.092, + "theoretical_loss": 4.1346644859829365, + "tokens_seen": 307363840 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580040120361083, + "loss": 3.1999, + "theoretical_loss": 4.1345614692301105, + "tokens_seen": 307429376 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045799398194583754, + "loss": 3.2377, + "theoretical_loss": 4.134458480582902, + "tokens_seen": 307494912 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045798395185556667, + "loss": 3.1984, + "theoretical_loss": 4.134355520027657, + "tokens_seen": 307560448 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579739217652959, + "loss": 3.3221, + "theoretical_loss": 4.134252587550728, + "tokens_seen": 307625984 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579638916750251, + "loss": 3.3228, + "theoretical_loss": 4.134149683138481, + "tokens_seen": 307691520 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045795386158475426, + "loss": 3.2926, + "theoretical_loss": 4.134046806777286, + "tokens_seen": 307757056 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579438314944835, + "loss": 3.2668, + "theoretical_loss": 4.133943958453528, + "tokens_seen": 307822592 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579338014042126, + "loss": 3.2369, + "theoretical_loss": 4.133841138153597, + "tokens_seen": 307888128 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045792377131394186, + "loss": 3.2752, + "theoretical_loss": 4.133738345863896, + "tokens_seen": 307953664 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 517196, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.22224760055542, + "objective/train/theoretical_loss": 4.133635581570836, + "objective/train/tokens_used": 328479200, + "theoretical_loss": 4.133635581570836, + "tokens_seen": 308019200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045791374122367104, + "loss": 3.2333, + "theoretical_loss": 4.133635581570836, + "tokens_seen": 308019200 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579037111334002, + "loss": 3.2316, + "theoretical_loss": 4.133532845260836, + "tokens_seen": 308084736 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578936810431294, + "loss": 3.2996, + "theoretical_loss": 4.133430136920327, + "tokens_seen": 308150272 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578836509528586, + "loss": 3.1802, + "theoretical_loss": 4.133327456535749, + "tokens_seen": 308215808 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045787362086258777, + "loss": 3.2469, + "theoretical_loss": 4.13322480409355, + "tokens_seen": 308281344 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457863590772317, + "loss": 3.2956, + "theoretical_loss": 4.133122179580189, + "tokens_seen": 308346880 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045785356068204613, + "loss": 3.157, + "theoretical_loss": 4.133019582982134, + "tokens_seen": 308412416 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045784353059177536, + "loss": 3.2701, + "theoretical_loss": 4.1329170142858604, + "tokens_seen": 308477952 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045783350050150455, + "loss": 3.4144, + "theoretical_loss": 4.132814473477857, + "tokens_seen": 308543488 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578234704112337, + "loss": 3.1877, + "theoretical_loss": 4.1327119605446185, + "tokens_seen": 308609024 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578134403209629, + "loss": 3.2701, + "theoretical_loss": 4.132609475472651, + "tokens_seen": 308674560 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578034102306921, + "loss": 3.3671, + "theoretical_loss": 4.132507018248469, + "tokens_seen": 308740096 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045779338014042127, + "loss": 3.2698, + "theoretical_loss": 4.132404588858597, + "tokens_seen": 308805632 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004577833500501505, + "loss": 3.3915, + "theoretical_loss": 4.132302187289568, + "tokens_seen": 308871168 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045777331995987963, + "loss": 3.2847, + "theoretical_loss": 4.132199813527926, + "tokens_seen": 308936704 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045776328986960887, + "loss": 3.3211, + "theoretical_loss": 4.132097467560223, + "tokens_seen": 309002240 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457753259779338, + "loss": 3.2165, + "theoretical_loss": 4.1319951493730205, + "tokens_seen": 309067776 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045774322968906723, + "loss": 3.2606, + "theoretical_loss": 4.131892858952889, + "tokens_seen": 309133312 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004577331995987964, + "loss": 3.1012, + "theoretical_loss": 4.131790596286409, + "tokens_seen": 309198848 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004577231695085256, + "loss": 3.2545, + "theoretical_loss": 4.1316883613601725, + "tokens_seen": 309264384 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004577131394182548, + "loss": 3.4352, + "theoretical_loss": 4.131586154160775, + "tokens_seen": 309329920 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045770310932798395, + "loss": 3.1881, + "theoretical_loss": 4.131483974674827, + "tokens_seen": 309395456 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045769307923771314, + "loss": 3.2993, + "theoretical_loss": 4.131381822888946, + "tokens_seen": 309460992 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045768304914744237, + "loss": 3.3242, + "theoretical_loss": 4.131279698789759, + "tokens_seen": 309526528 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576730190571715, + "loss": 3.2762, + "theoretical_loss": 4.1311776023639, + "tokens_seen": 309592064 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 520038, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3959357738494873, + "objective/train/theoretical_loss": 4.131075533598018, + "objective/train/tokens_used": 330117600, + "theoretical_loss": 4.131075533598018, + "tokens_seen": 309657600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045766298896690073, + "loss": 3.3427, + "theoretical_loss": 4.131075533598018, + "tokens_seen": 309657600 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576529588766299, + "loss": 3.338, + "theoretical_loss": 4.130973492478766, + "tokens_seen": 309723136 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576429287863591, + "loss": 3.3992, + "theoretical_loss": 4.130871478992807, + "tokens_seen": 309788672 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576328986960883, + "loss": 3.2491, + "theoretical_loss": 4.130769493126817, + "tokens_seen": 309854208 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045762286860581746, + "loss": 3.1507, + "theoretical_loss": 4.130667534867476, + "tokens_seen": 309919744 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045761283851554664, + "loss": 3.3534, + "theoretical_loss": 4.130565604201477, + "tokens_seen": 309985280 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576028084252759, + "loss": 3.3015, + "theoretical_loss": 4.130463701115521, + "tokens_seen": 310050816 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457592778335005, + "loss": 3.2116, + "theoretical_loss": 4.130361825596317, + "tokens_seen": 310116352 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045758274824473424, + "loss": 3.2166, + "theoretical_loss": 4.130259977630586, + "tokens_seen": 310181888 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045757271815446336, + "loss": 3.0476, + "theoretical_loss": 4.130158157205056, + "tokens_seen": 310247424 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004575626880641926, + "loss": 3.1096, + "theoretical_loss": 4.130056364306465, + "tokens_seen": 310312960 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004575526579739218, + "loss": 3.23, + "theoretical_loss": 4.129954598921559, + "tokens_seen": 310378496 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045754262788365096, + "loss": 3.2229, + "theoretical_loss": 4.1298528610370955, + "tokens_seen": 310444032 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045753259779338014, + "loss": 3.2415, + "theoretical_loss": 4.12975115063984, + "tokens_seen": 310509568 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004575225677031093, + "loss": 2.9948, + "theoretical_loss": 4.129649467716565, + "tokens_seen": 310575104 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004575125376128385, + "loss": 3.3149, + "theoretical_loss": 4.1295478122540565, + "tokens_seen": 310640640 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045750250752256774, + "loss": 3.0764, + "theoretical_loss": 4.1294461842391055, + "tokens_seen": 310706176 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045749247743229687, + "loss": 3.0559, + "theoretical_loss": 4.129344583658516, + "tokens_seen": 310771712 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004574824473420261, + "loss": 3.2306, + "theoretical_loss": 4.1292430104990965, + "tokens_seen": 310837248 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004574724172517553, + "loss": 3.4254, + "theoretical_loss": 4.1291414647476685, + "tokens_seen": 310902784 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045746238716148446, + "loss": 3.3266, + "theoretical_loss": 4.129039946391062, + "tokens_seen": 310968320 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045745235707121364, + "loss": 3.2865, + "theoretical_loss": 4.128938455416115, + "tokens_seen": 311033856 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004574423269809428, + "loss": 3.2773, + "theoretical_loss": 4.128836991809674, + "tokens_seen": 311099392 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457432296890672, + "loss": 3.3857, + "theoretical_loss": 4.128735555558597, + "tokens_seen": 311164928 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045742226680040124, + "loss": 3.3099, + "theoretical_loss": 4.128634146649748, + "tokens_seen": 311230464 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 522843, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3568053245544434, + "objective/train/theoretical_loss": 4.128532765070004, + "objective/train/tokens_used": 331756000, + "theoretical_loss": 4.128532765070004, + "tokens_seen": 311296000 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045741223671013037, + "loss": 3.269, + "theoretical_loss": 4.128532765070004, + "tokens_seen": 311296000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004574022066198596, + "loss": 3.3598, + "theoretical_loss": 4.128431410806247, + "tokens_seen": 311361536 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045739217652958873, + "loss": 3.2458, + "theoretical_loss": 4.12833008384537, + "tokens_seen": 311427072 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045738214643931797, + "loss": 3.4328, + "theoretical_loss": 4.128228784174275, + "tokens_seen": 311492608 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045737211634904715, + "loss": 3.1779, + "theoretical_loss": 4.128127511779873, + "tokens_seen": 311558144 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045736208625877633, + "loss": 3.2182, + "theoretical_loss": 4.128026266649085, + "tokens_seen": 311623680 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004573520561685055, + "loss": 3.3602, + "theoretical_loss": 4.127925048768839, + "tokens_seen": 311689216 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045734202607823475, + "loss": 3.2433, + "theoretical_loss": 4.127823858126073, + "tokens_seen": 311754752 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045733199598796387, + "loss": 3.134, + "theoretical_loss": 4.1277226947077335, + "tokens_seen": 311820288 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004573219658976931, + "loss": 3.1605, + "theoretical_loss": 4.127621558500778, + "tokens_seen": 311885824 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045731193580742223, + "loss": 3.2235, + "theoretical_loss": 4.12752044949217, + "tokens_seen": 311951360 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045730190571715147, + "loss": 3.2163, + "theoretical_loss": 4.127419367668884, + "tokens_seen": 312016896 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045729187562688065, + "loss": 3.2045, + "theoretical_loss": 4.127318313017904, + "tokens_seen": 312082432 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045728184553660983, + "loss": 3.3063, + "theoretical_loss": 4.12721728552622, + "tokens_seen": 312147968 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457271815446339, + "loss": 3.1217, + "theoretical_loss": 4.1271162851808345, + "tokens_seen": 312213504 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004572617853560682, + "loss": 3.1988, + "theoretical_loss": 4.127015311968757, + "tokens_seen": 312279040 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004572517552657974, + "loss": 3.2624, + "theoretical_loss": 4.126914365877004, + "tokens_seen": 312344576 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004572417251755266, + "loss": 3.2247, + "theoretical_loss": 4.126813446892607, + "tokens_seen": 312410112 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045723169508525574, + "loss": 3.2076, + "theoretical_loss": 4.1267125550026, + "tokens_seen": 312475648 + }, + { + "epoch": 0.09, + "learning_rate": 0.000457221664994985, + "loss": 3.3104, + "theoretical_loss": 4.1266116901940295, + "tokens_seen": 312541184 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004572116349047141, + "loss": 3.0543, + "theoretical_loss": 4.126510852453949, + "tokens_seen": 312606720 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045720160481444334, + "loss": 3.2875, + "theoretical_loss": 4.126410041769423, + "tokens_seen": 312672256 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045719157472417257, + "loss": 3.2166, + "theoretical_loss": 4.126309258127524, + "tokens_seen": 312737792 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571815446339017, + "loss": 3.2035, + "theoretical_loss": 4.126208501515331, + "tokens_seen": 312803328 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045717151454363093, + "loss": 3.1913, + "theoretical_loss": 4.126107771919935, + "tokens_seen": 312868864 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 525512, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.055954933166504, + "objective/train/theoretical_loss": 4.126007069328436, + "objective/train/tokens_used": 333394400, + "theoretical_loss": 4.126007069328436, + "tokens_seen": 312934400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571614844533601, + "loss": 3.1845, + "theoretical_loss": 4.126007069328436, + "tokens_seen": 312934400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571514543630893, + "loss": 3.3533, + "theoretical_loss": 4.125906393727941, + "tokens_seen": 312999936 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571414242728185, + "loss": 3.2977, + "theoretical_loss": 4.125805745105566, + "tokens_seen": 313065472 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045713139418254766, + "loss": 3.2938, + "theoretical_loss": 4.125705123448437, + "tokens_seen": 313131008 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045712136409227684, + "loss": 3.1551, + "theoretical_loss": 4.125604528743689, + "tokens_seen": 313196544 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571113340020061, + "loss": 3.304, + "theoretical_loss": 4.125503960978464, + "tokens_seen": 313262080 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004571013039117352, + "loss": 3.1121, + "theoretical_loss": 4.1254034201399135, + "tokens_seen": 313327616 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045709127382146444, + "loss": 3.2713, + "theoretical_loss": 4.125302906215199, + "tokens_seen": 313393152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045708124373119356, + "loss": 3.2026, + "theoretical_loss": 4.12520241919149, + "tokens_seen": 313458688 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004570712136409228, + "loss": 3.2861, + "theoretical_loss": 4.125101959055965, + "tokens_seen": 313524224 + }, + { + "epoch": 0.1, + "learning_rate": 0.000457061183550652, + "loss": 3.3836, + "theoretical_loss": 4.125001525795811, + "tokens_seen": 313589760 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045705115346038116, + "loss": 3.2514, + "theoretical_loss": 4.124901119398222, + "tokens_seen": 313655296 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045704112337011034, + "loss": 3.1672, + "theoretical_loss": 4.124800739850406, + "tokens_seen": 313720832 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004570310932798395, + "loss": 3.0066, + "theoretical_loss": 4.124700387139574, + "tokens_seen": 313786368 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004570210631895687, + "loss": 3.337, + "theoretical_loss": 4.12460006125295, + "tokens_seen": 313851904 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045701103309929794, + "loss": 3.2564, + "theoretical_loss": 4.124499762177764, + "tokens_seen": 313917440 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045700100300902707, + "loss": 3.2957, + "theoretical_loss": 4.124399489901254, + "tokens_seen": 313982976 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004569909729187563, + "loss": 3.3472, + "theoretical_loss": 4.124299244410672, + "tokens_seen": 314048512 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004569809428284855, + "loss": 3.194, + "theoretical_loss": 4.124199025693272, + "tokens_seen": 314114048 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045697091273821466, + "loss": 3.0069, + "theoretical_loss": 4.124098833736321, + "tokens_seen": 314179584 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045696088264794384, + "loss": 3.1095, + "theoretical_loss": 4.123998668527094, + "tokens_seen": 314245120 + }, + { + "epoch": 0.1, + "learning_rate": 0.000456950852557673, + "loss": 3.2601, + "theoretical_loss": 4.123898530052874, + "tokens_seen": 314310656 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004569408224674022, + "loss": 3.163, + "theoretical_loss": 4.123798418300953, + "tokens_seen": 314376192 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045693079237713144, + "loss": 3.1817, + "theoretical_loss": 4.123698333258631, + "tokens_seen": 314441728 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045692076228686057, + "loss": 3.3328, + "theoretical_loss": 4.123598274913219, + "tokens_seen": 314507264 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 528422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.258194923400879, + "objective/train/theoretical_loss": 4.123498243252032, + "objective/train/tokens_used": 335032800, + "theoretical_loss": 4.123498243252032, + "tokens_seen": 314572800 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004569107321965898, + "loss": 3.2314, + "theoretical_loss": 4.123498243252032, + "tokens_seen": 314572800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045690070210631893, + "loss": 3.1554, + "theoretical_loss": 4.1233982382624, + "tokens_seen": 314638336 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045689067201604817, + "loss": 3.277, + "theoretical_loss": 4.123298259931657, + "tokens_seen": 314703872 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045688064192577735, + "loss": 3.3335, + "theoretical_loss": 4.123198308247146, + "tokens_seen": 314769408 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045687061183550653, + "loss": 3.2308, + "theoretical_loss": 4.123098383196222, + "tokens_seen": 314834944 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004568605817452357, + "loss": 3.3428, + "theoretical_loss": 4.122998484766244, + "tokens_seen": 314900480 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045685055165496495, + "loss": 3.1711, + "theoretical_loss": 4.122898612944582, + "tokens_seen": 314966016 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045684052156469407, + "loss": 3.2353, + "theoretical_loss": 4.122798767718616, + "tokens_seen": 315031552 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004568304914744233, + "loss": 3.2583, + "theoretical_loss": 4.122698949075732, + "tokens_seen": 315097088 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045682046138415243, + "loss": 3.3045, + "theoretical_loss": 4.122599157003327, + "tokens_seen": 315162624 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045681043129388167, + "loss": 3.1544, + "theoretical_loss": 4.1224993914888035, + "tokens_seen": 315228160 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045680040120361085, + "loss": 3.2556, + "theoretical_loss": 4.122399652519576, + "tokens_seen": 315293696 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045679037111334003, + "loss": 3.4348, + "theoretical_loss": 4.122299940083065, + "tokens_seen": 315359232 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567803410230692, + "loss": 3.143, + "theoretical_loss": 4.1222002541667, + "tokens_seen": 315424768 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567703109327984, + "loss": 3.2333, + "theoretical_loss": 4.122100594757921, + "tokens_seen": 315490304 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567602808425276, + "loss": 3.2574, + "theoretical_loss": 4.122000961844175, + "tokens_seen": 315555840 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567502507522568, + "loss": 3.2248, + "theoretical_loss": 4.121901355412917, + "tokens_seen": 315621376 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045674022066198594, + "loss": 3.1932, + "theoretical_loss": 4.121801775451612, + "tokens_seen": 315686912 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567301905717152, + "loss": 3.3079, + "theoretical_loss": 4.121702221947732, + "tokens_seen": 315752448 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567201604814443, + "loss": 3.401, + "theoretical_loss": 4.121602694888759, + "tokens_seen": 315817984 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045671013039117354, + "loss": 3.3611, + "theoretical_loss": 4.121503194262183, + "tokens_seen": 315883520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004567001003009027, + "loss": 3.2084, + "theoretical_loss": 4.121403720055502, + "tokens_seen": 315949056 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566900702106319, + "loss": 3.2166, + "theoretical_loss": 4.121304272256222, + "tokens_seen": 316014592 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566800401203611, + "loss": 3.2755, + "theoretical_loss": 4.121204850851861, + "tokens_seen": 316080128 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566700100300903, + "loss": 3.0971, + "theoretical_loss": 4.121105455829939, + "tokens_seen": 316145664 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 531254, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.064321517944336, + "objective/train/theoretical_loss": 4.121006087177992, + "objective/train/tokens_used": 336671200, + "theoretical_loss": 4.121006087177992, + "tokens_seen": 316211200 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045665997993981944, + "loss": 3.1859, + "theoretical_loss": 4.121006087177992, + "tokens_seen": 316211200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566499498495487, + "loss": 2.9615, + "theoretical_loss": 4.120906744883559, + "tokens_seen": 316276736 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566399197592778, + "loss": 3.336, + "theoretical_loss": 4.120807428934189, + "tokens_seen": 316342272 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045662988966900704, + "loss": 3.1616, + "theoretical_loss": 4.120708139317441, + "tokens_seen": 316407808 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566198595787362, + "loss": 3.2708, + "theoretical_loss": 4.12060887602088, + "tokens_seen": 316473344 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004566098294884654, + "loss": 3.1956, + "theoretical_loss": 4.120509639032081, + "tokens_seen": 316538880 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565997993981946, + "loss": 3.1647, + "theoretical_loss": 4.120410428338628, + "tokens_seen": 316604416 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045658976930792376, + "loss": 3.24, + "theoretical_loss": 4.120311243928111, + "tokens_seen": 316669952 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045657973921765294, + "loss": 3.256, + "theoretical_loss": 4.120212085788131, + "tokens_seen": 316735488 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565697091273822, + "loss": 3.2696, + "theoretical_loss": 4.120112953906296, + "tokens_seen": 316801024 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565596790371113, + "loss": 3.3948, + "theoretical_loss": 4.120013848270222, + "tokens_seen": 316866560 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045654964894684054, + "loss": 3.2885, + "theoretical_loss": 4.119914768867536, + "tokens_seen": 316932096 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045653961885656967, + "loss": 3.2378, + "theoretical_loss": 4.11981571568587, + "tokens_seen": 316997632 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565295887662989, + "loss": 3.1772, + "theoretical_loss": 4.119716688712866, + "tokens_seen": 317063168 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565195586760281, + "loss": 3.055, + "theoretical_loss": 4.119617687936175, + "tokens_seen": 317128704 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045650952858575727, + "loss": 3.1641, + "theoretical_loss": 4.119518713343455, + "tokens_seen": 317194240 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045649949849548645, + "loss": 3.4017, + "theoretical_loss": 4.119419764922374, + "tokens_seen": 317259776 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004564894684052157, + "loss": 3.057, + "theoretical_loss": 4.119320842660606, + "tokens_seen": 317325312 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004564794383149448, + "loss": 3.167, + "theoretical_loss": 4.119221946545836, + "tokens_seen": 317390848 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045646940822467405, + "loss": 3.4474, + "theoretical_loss": 4.119123076565755, + "tokens_seen": 317456384 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045645937813440317, + "loss": 3.16, + "theoretical_loss": 4.119024232708064, + "tokens_seen": 317521920 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004564493480441324, + "loss": 3.1897, + "theoretical_loss": 4.118925414960472, + "tokens_seen": 317587456 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045643931795386164, + "loss": 3.3025, + "theoretical_loss": 4.118826623310696, + "tokens_seen": 317652992 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045642928786359077, + "loss": 3.3093, + "theoretical_loss": 4.11872785774646, + "tokens_seen": 317718528 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045641925777332, + "loss": 3.1864, + "theoretical_loss": 4.1186291182555, + "tokens_seen": 317784064 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 532742, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.390580177307129, + "objective/train/theoretical_loss": 4.118530404825556, + "objective/train/tokens_used": 338309600, + "theoretical_loss": 4.118530404825556, + "tokens_seen": 317849600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045640922768304913, + "loss": 3.2205, + "theoretical_loss": 4.118530404825556, + "tokens_seen": 317849600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045639919759277837, + "loss": 3.1004, + "theoretical_loss": 4.11843171744438, + "tokens_seen": 317915136 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045638916750250755, + "loss": 3.3318, + "theoretical_loss": 4.118333056099728, + "tokens_seen": 317980672 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045637913741223673, + "loss": 3.1802, + "theoretical_loss": 4.11823442077937, + "tokens_seen": 318046208 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004563691073219659, + "loss": 3.1751, + "theoretical_loss": 4.1181358114710775, + "tokens_seen": 318111744 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045635907723169515, + "loss": 3.0754, + "theoretical_loss": 4.1180372281626365, + "tokens_seen": 318177280 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045634904714142427, + "loss": 3.133, + "theoretical_loss": 4.117938670841838, + "tokens_seen": 318242816 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004563390170511535, + "loss": 3.1666, + "theoretical_loss": 4.117840139496482, + "tokens_seen": 318308352 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045632898696088263, + "loss": 3.1426, + "theoretical_loss": 4.117741634114376, + "tokens_seen": 318373888 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045631895687061187, + "loss": 3.3052, + "theoretical_loss": 4.1176431546833365, + "tokens_seen": 318439424 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045630892678034105, + "loss": 3.2731, + "theoretical_loss": 4.117544701191187, + "tokens_seen": 318504960 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045629889669007023, + "loss": 3.2737, + "theoretical_loss": 4.117446273625763, + "tokens_seen": 318570496 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562888665997994, + "loss": 3.2656, + "theoretical_loss": 4.117347871974903, + "tokens_seen": 318636032 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562788365095286, + "loss": 3.3922, + "theoretical_loss": 4.1172494962264565, + "tokens_seen": 318701568 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562688064192578, + "loss": 3.1591, + "theoretical_loss": 4.117151146368282, + "tokens_seen": 318767104 + }, + { + "epoch": 0.1, + "learning_rate": 0.000456258776328987, + "loss": 3.2836, + "theoretical_loss": 4.117052822388243, + "tokens_seen": 318832640 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045624874623871614, + "loss": 3.3002, + "theoretical_loss": 4.116954524274216, + "tokens_seen": 318898176 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562387161484454, + "loss": 3.1038, + "theoretical_loss": 4.11685625201408, + "tokens_seen": 318963712 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562286860581745, + "loss": 3.1742, + "theoretical_loss": 4.116758005595727, + "tokens_seen": 319029248 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045621865596790374, + "loss": 3.342, + "theoretical_loss": 4.116659785007055, + "tokens_seen": 319094784 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562086258776329, + "loss": 3.3438, + "theoretical_loss": 4.116561590235969, + "tokens_seen": 319160320 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561985957873621, + "loss": 3.2334, + "theoretical_loss": 4.116463421270385, + "tokens_seen": 319225856 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561885656970913, + "loss": 3.1666, + "theoretical_loss": 4.116365278098225, + "tokens_seen": 319291392 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561785356068205, + "loss": 3.1571, + "theoretical_loss": 4.116267160707421, + "tokens_seen": 319356928 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045616850551654964, + "loss": 2.9918, + "theoretical_loss": 4.11616906908591, + "tokens_seen": 319422464 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 535531, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2355546951293945, + "objective/train/theoretical_loss": 4.11607100322164, + "objective/train/tokens_used": 339948000, + "theoretical_loss": 4.11607100322164, + "tokens_seen": 319488000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561584754262789, + "loss": 3.2201, + "theoretical_loss": 4.11607100322164, + "tokens_seen": 319488000 + }, + { + "epoch": 0.1, + "learning_rate": 0.000456148445336008, + "loss": 3.1606, + "theoretical_loss": 4.115972963102565, + "tokens_seen": 319553536 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045613841524573724, + "loss": 3.3155, + "theoretical_loss": 4.11587494871665, + "tokens_seen": 319619072 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561283851554664, + "loss": 3.1255, + "theoretical_loss": 4.115776960051864, + "tokens_seen": 319684608 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561183550651956, + "loss": 3.2613, + "theoretical_loss": 4.11567899709619, + "tokens_seen": 319750144 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561083249749248, + "loss": 2.9822, + "theoretical_loss": 4.115581059837612, + "tokens_seen": 319815680 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045609829488465396, + "loss": 3.1753, + "theoretical_loss": 4.115483148264127, + "tokens_seen": 319881216 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045608826479438314, + "loss": 3.3311, + "theoretical_loss": 4.115385262363739, + "tokens_seen": 319946752 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004560782347041124, + "loss": 3.182, + "theoretical_loss": 4.1152874021244585, + "tokens_seen": 320012288 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004560682046138415, + "loss": 3.1631, + "theoretical_loss": 4.115189567534307, + "tokens_seen": 320077824 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045605817452357074, + "loss": 3.2866, + "theoretical_loss": 4.115091758581309, + "tokens_seen": 320143360 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045604814443329987, + "loss": 3.2554, + "theoretical_loss": 4.114993975253505, + "tokens_seen": 320208896 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004560381143430291, + "loss": 3.2154, + "theoretical_loss": 4.114896217538935, + "tokens_seen": 320274432 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004560280842527583, + "loss": 3.2867, + "theoretical_loss": 4.114798485425652, + "tokens_seen": 320339968 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045601805416248747, + "loss": 3.2792, + "theoretical_loss": 4.114700778901717, + "tokens_seen": 320405504 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045600802407221665, + "loss": 3.0472, + "theoretical_loss": 4.114603097955197, + "tokens_seen": 320471040 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004559979939819459, + "loss": 3.3205, + "theoretical_loss": 4.114505442574167, + "tokens_seen": 320536576 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455987963891675, + "loss": 3.2939, + "theoretical_loss": 4.1144078127467125, + "tokens_seen": 320602112 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045597793380140425, + "loss": 3.2443, + "theoretical_loss": 4.114310208460924, + "tokens_seen": 320667648 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045596790371113337, + "loss": 3.209, + "theoretical_loss": 4.114212629704902, + "tokens_seen": 320733184 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004559578736208626, + "loss": 3.2057, + "theoretical_loss": 4.114115076466755, + "tokens_seen": 320798720 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004559478435305918, + "loss": 3.1821, + "theoretical_loss": 4.114017548734598, + "tokens_seen": 320864256 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045593781344032097, + "loss": 3.3201, + "theoretical_loss": 4.113920046496554, + "tokens_seen": 320929792 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045592778335005015, + "loss": 3.2319, + "theoretical_loss": 4.113822569740757, + "tokens_seen": 320995328 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045591775325977933, + "loss": 3.2853, + "theoretical_loss": 4.113725118455344, + "tokens_seen": 321060864 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3203237056732178, + "objective/train/theoretical_loss": 4.113627692628464, + "objective/train/tokens_used": 341586400, + "theoretical_loss": 4.113627692628464, + "tokens_seen": 321126400 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004559077231695085, + "loss": 3.1105, + "theoretical_loss": 4.113627692628464, + "tokens_seen": 321126400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045589769307923775, + "loss": 3.2472, + "theoretical_loss": 4.113530292248273, + "tokens_seen": 321191936 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004558876629889669, + "loss": 3.2024, + "theoretical_loss": 4.113432917302934, + "tokens_seen": 321257472 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004558776328986961, + "loss": 3.3586, + "theoretical_loss": 4.113335567780618, + "tokens_seen": 321323008 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045586760280842524, + "loss": 3.2226, + "theoretical_loss": 4.113238243669504, + "tokens_seen": 321388544 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004558575727181545, + "loss": 3.1224, + "theoretical_loss": 4.113140944957781, + "tokens_seen": 321454080 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045584754262788365, + "loss": 3.4267, + "theoretical_loss": 4.113043671633641, + "tokens_seen": 321519616 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045583751253761284, + "loss": 3.0998, + "theoretical_loss": 4.11294642368529, + "tokens_seen": 321585152 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455827482447342, + "loss": 3.1206, + "theoretical_loss": 4.112849201100938, + "tokens_seen": 321650688 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045581745235707125, + "loss": 3.162, + "theoretical_loss": 4.1127520038688035, + "tokens_seen": 321716224 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004558074222668004, + "loss": 3.1964, + "theoretical_loss": 4.112654831977112, + "tokens_seen": 321781760 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557973921765296, + "loss": 3.2131, + "theoretical_loss": 4.1125576854141, + "tokens_seen": 321847296 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045578736208625874, + "loss": 3.2353, + "theoretical_loss": 4.112460564168009, + "tokens_seen": 321912832 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455777331995988, + "loss": 3.1361, + "theoretical_loss": 4.112363468227088, + "tokens_seen": 321978368 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045576730190571716, + "loss": 3.2599, + "theoretical_loss": 4.112266397579598, + "tokens_seen": 322043904 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045575727181544634, + "loss": 3.3002, + "theoretical_loss": 4.112169352213801, + "tokens_seen": 322109440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557472417251755, + "loss": 3.1576, + "theoretical_loss": 4.1120723321179735, + "tokens_seen": 322174976 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557372116349047, + "loss": 3.1994, + "theoretical_loss": 4.111975337280397, + "tokens_seen": 322240512 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557271815446339, + "loss": 3.319, + "theoretical_loss": 4.111878367689359, + "tokens_seen": 322306048 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004557171514543631, + "loss": 3.2206, + "theoretical_loss": 4.1117814233331575, + "tokens_seen": 322371584 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045570712136409224, + "loss": 3.2422, + "theoretical_loss": 4.111684504200099, + "tokens_seen": 322437120 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556970912738215, + "loss": 2.9533, + "theoretical_loss": 4.111587610278494, + "tokens_seen": 322502656 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556870611835507, + "loss": 3.0091, + "theoretical_loss": 4.111490741556663, + "tokens_seen": 322568192 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045567703109327984, + "loss": 3.1844, + "theoretical_loss": 4.1113938980229365, + "tokens_seen": 322633728 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556670010030091, + "loss": 3.2474, + "theoretical_loss": 4.11129707966565, + "tokens_seen": 322699264 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.350184917449951, + "objective/train/theoretical_loss": 4.111200286473145, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.111200286473145, + "tokens_seen": 322764800 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556569709127382, + "loss": 3.195, + "theoretical_loss": 4.111200286473145, + "tokens_seen": 322764800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045564694082246744, + "loss": 3.2746, + "theoretical_loss": 4.111103518433776, + "tokens_seen": 322830336 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556369107321966, + "loss": 3.1853, + "theoretical_loss": 4.111006775535901, + "tokens_seen": 322895872 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556268806419258, + "loss": 3.0912, + "theoretical_loss": 4.110910057767887, + "tokens_seen": 322961408 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455616850551655, + "loss": 3.2097, + "theoretical_loss": 4.110813365118109, + "tokens_seen": 323026944 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045560682046138416, + "loss": 3.2755, + "theoretical_loss": 4.110716697574951, + "tokens_seen": 323092480 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045559679037111334, + "loss": 3.2227, + "theoretical_loss": 4.110620055126802, + "tokens_seen": 323158016 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555867602808426, + "loss": 3.3381, + "theoretical_loss": 4.110523437762059, + "tokens_seen": 323223552 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555767301905717, + "loss": 3.1282, + "theoretical_loss": 4.11042684546913, + "tokens_seen": 323289088 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045556670010030094, + "loss": 3.2157, + "theoretical_loss": 4.110330278236427, + "tokens_seen": 323354624 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045555667001003007, + "loss": 3.2323, + "theoretical_loss": 4.110233736052372, + "tokens_seen": 323420160 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555466399197593, + "loss": 3.1831, + "theoretical_loss": 4.110137218905393, + "tokens_seen": 323485696 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555366098294885, + "loss": 3.2589, + "theoretical_loss": 4.110040726783927, + "tokens_seen": 323551232 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045552657973921767, + "loss": 3.3196, + "theoretical_loss": 4.109944259676419, + "tokens_seen": 323616768 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045551654964894685, + "loss": 3.3433, + "theoretical_loss": 4.109847817571319, + "tokens_seen": 323682304 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004555065195586761, + "loss": 3.1517, + "theoretical_loss": 4.109751400457089, + "tokens_seen": 323747840 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004554964894684052, + "loss": 3.2057, + "theoretical_loss": 4.109655008322195, + "tokens_seen": 323813376 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045548645937813445, + "loss": 3.0907, + "theoretical_loss": 4.109558641155112, + "tokens_seen": 323878912 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045547642928786357, + "loss": 3.4015, + "theoretical_loss": 4.109462298944322, + "tokens_seen": 323944448 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004554663991975928, + "loss": 3.2771, + "theoretical_loss": 4.109365981678316, + "tokens_seen": 324009984 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455456369107322, + "loss": 3.2206, + "theoretical_loss": 4.109269689345592, + "tokens_seen": 324075520 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045544633901705117, + "loss": 3.2941, + "theoretical_loss": 4.109173421934654, + "tokens_seen": 324141056 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045543630892678035, + "loss": 3.2045, + "theoretical_loss": 4.109077179434016, + "tokens_seen": 324206592 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045542627883650953, + "loss": 3.2024, + "theoretical_loss": 4.1089809618321995, + "tokens_seen": 324272128 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004554162487462387, + "loss": 3.4802, + "theoretical_loss": 4.108884769117731, + "tokens_seen": 324337664 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1660821437835693, + "objective/train/theoretical_loss": 4.108788601279149, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.108788601279149, + "tokens_seen": 324403200 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045540621865596795, + "loss": 3.2654, + "theoretical_loss": 4.108788601279149, + "tokens_seen": 324403200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553961885656971, + "loss": 3.3852, + "theoretical_loss": 4.108692458304994, + "tokens_seen": 324468736 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553861584754263, + "loss": 3.1101, + "theoretical_loss": 4.108596340183819, + "tokens_seen": 324534272 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045537612838515544, + "loss": 3.0887, + "theoretical_loss": 4.108500246904184, + "tokens_seen": 324599808 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553660982948847, + "loss": 3.2253, + "theoretical_loss": 4.108404178454651, + "tokens_seen": 324665344 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045535606820461385, + "loss": 3.2317, + "theoretical_loss": 4.1083081348237975, + "tokens_seen": 324730880 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045534603811434304, + "loss": 3.2019, + "theoretical_loss": 4.108212116000203, + "tokens_seen": 324796416 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553360080240722, + "loss": 3.2089, + "theoretical_loss": 4.108116121972457, + "tokens_seen": 324861952 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045532597793380145, + "loss": 3.1375, + "theoretical_loss": 4.108020152729157, + "tokens_seen": 324927488 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553159478435306, + "loss": 3.1287, + "theoretical_loss": 4.107924208258905, + "tokens_seen": 324993024 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553059177532598, + "loss": 3.2768, + "theoretical_loss": 4.107828288550314, + "tokens_seen": 325058560 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045529588766298894, + "loss": 3.179, + "theoretical_loss": 4.107732393592003, + "tokens_seen": 325124096 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552858575727182, + "loss": 3.1553, + "theoretical_loss": 4.107636523372598, + "tokens_seen": 325189632 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045527582748244736, + "loss": 3.1557, + "theoretical_loss": 4.107540677880734, + "tokens_seen": 325255168 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045526579739217654, + "loss": 3.2335, + "theoretical_loss": 4.107444857105052, + "tokens_seen": 325320704 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552557673019057, + "loss": 3.0586, + "theoretical_loss": 4.107349061034201, + "tokens_seen": 325386240 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552457372116349, + "loss": 3.1236, + "theoretical_loss": 4.107253289656838, + "tokens_seen": 325451776 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552357071213641, + "loss": 3.1803, + "theoretical_loss": 4.107157542961628, + "tokens_seen": 325517312 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552256770310933, + "loss": 3.1237, + "theoretical_loss": 4.10706182093724, + "tokens_seen": 325582848 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045521564694082244, + "loss": 3.0213, + "theoretical_loss": 4.106966123572356, + "tokens_seen": 325648384 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004552056168505517, + "loss": 3.2426, + "theoretical_loss": 4.106870450855661, + "tokens_seen": 325713920 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551955867602808, + "loss": 3.363, + "theoretical_loss": 4.106774802775849, + "tokens_seen": 325779456 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045518555667001004, + "loss": 3.2612, + "theoretical_loss": 4.106679179321622, + "tokens_seen": 325844992 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551755265797392, + "loss": 3.2478, + "theoretical_loss": 4.106583580481689, + "tokens_seen": 325910528 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551654964894684, + "loss": 3.2782, + "theoretical_loss": 4.106488006244767, + "tokens_seen": 325976064 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.218202590942383, + "objective/train/theoretical_loss": 4.106392456599577, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.106392456599577, + "tokens_seen": 326041600 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551554663991976, + "loss": 3.1963, + "theoretical_loss": 4.106392456599577, + "tokens_seen": 326041600 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551454363089268, + "loss": 2.9129, + "theoretical_loss": 4.106296931534854, + "tokens_seen": 326107136 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045513540621865595, + "loss": 3.1724, + "theoretical_loss": 4.106201431039335, + "tokens_seen": 326172672 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551253761283852, + "loss": 3.2777, + "theoretical_loss": 4.106105955101766, + "tokens_seen": 326238208 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004551153460381143, + "loss": 3.2442, + "theoretical_loss": 4.1060105037109, + "tokens_seen": 326303744 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045510531594784354, + "loss": 3.1887, + "theoretical_loss": 4.105915076855499, + "tokens_seen": 326369280 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550952858575727, + "loss": 3.3205, + "theoretical_loss": 4.105819674524332, + "tokens_seen": 326434816 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550852557673019, + "loss": 3.0339, + "theoretical_loss": 4.105724296706172, + "tokens_seen": 326500352 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550752256770311, + "loss": 3.1569, + "theoretical_loss": 4.105628943389805, + "tokens_seen": 326565888 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045506519558676027, + "loss": 3.1572, + "theoretical_loss": 4.1055336145640196, + "tokens_seen": 326631424 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045505516549648945, + "loss": 3.3293, + "theoretical_loss": 4.105438310217615, + "tokens_seen": 326696960 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550451354062187, + "loss": 3.0165, + "theoretical_loss": 4.105343030339395, + "tokens_seen": 326762496 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550351053159478, + "loss": 3.264, + "theoretical_loss": 4.1052477749181735, + "tokens_seen": 326828032 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045502507522567705, + "loss": 3.2845, + "theoretical_loss": 4.10515254394277, + "tokens_seen": 326893568 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045501504513540623, + "loss": 3.2235, + "theoretical_loss": 4.1050573374020125, + "tokens_seen": 326959104 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550050150451354, + "loss": 3.2896, + "theoretical_loss": 4.104962155284734, + "tokens_seen": 327024640 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549949849548646, + "loss": 3.0766, + "theoretical_loss": 4.104866997579778, + "tokens_seen": 327090176 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045498495486459377, + "loss": 3.1934, + "theoretical_loss": 4.104771864275993, + "tokens_seen": 327155712 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045497492477432295, + "loss": 3.3142, + "theoretical_loss": 4.104676755362237, + "tokens_seen": 327221248 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549648946840522, + "loss": 3.1229, + "theoretical_loss": 4.104581670827372, + "tokens_seen": 327286784 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045495486459378137, + "loss": 3.1308, + "theoretical_loss": 4.10448661066027, + "tokens_seen": 327352320 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045494483450351055, + "loss": 3.0286, + "theoretical_loss": 4.104391574849812, + "tokens_seen": 327417856 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045493480441323973, + "loss": 3.0612, + "theoretical_loss": 4.10429656338488, + "tokens_seen": 327483392 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549247743229689, + "loss": 3.2322, + "theoretical_loss": 4.104201576254369, + "tokens_seen": 327548928 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045491474423269815, + "loss": 3.1844, + "theoretical_loss": 4.10410661344718, + "tokens_seen": 327614464 + }, + { + "debugging/Self-BLEU-5": 0.6066847106359012, + "debugging/distinct-1-grams": 0.7360367063174506, + "debugging/distinct-2-grams": 0.9459259794780506, + "debugging/entropy-1-grams": 6.10782003058958, + "debugging/entropy-2-grams": 7.199497578617988, + "debugging/length": 548.578947368421, + "debugging/num_segments": 19, + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.107281446456909, + "objective/train/theoretical_loss": 4.10401167495222, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.10401167495222, + "tokens_seen": 327680000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549047141424273, + "loss": 3.064, + "theoretical_loss": 4.10401167495222, + "tokens_seen": 327680000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548946840521565, + "loss": 3.3277, + "theoretical_loss": 4.103916760758405, + "tokens_seen": 327745536 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045488465396188564, + "loss": 3.3989, + "theoretical_loss": 4.103821870854656, + "tokens_seen": 327811072 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548746238716149, + "loss": 3.206, + "theoretical_loss": 4.103727005229903, + "tokens_seen": 327876608 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045486459378134405, + "loss": 3.188, + "theoretical_loss": 4.103632163873083, + "tokens_seen": 327942144 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045485456369107324, + "loss": 3.2548, + "theoretical_loss": 4.10353734677314, + "tokens_seen": 328007680 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548445336008024, + "loss": 3.2377, + "theoretical_loss": 4.103442553919026, + "tokens_seen": 328073216 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045483450351053165, + "loss": 3.3397, + "theoretical_loss": 4.1033477852996985, + "tokens_seen": 328138752 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548244734202608, + "loss": 3.118, + "theoretical_loss": 4.103253040904124, + "tokens_seen": 328204288 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045481444332999, + "loss": 3.1345, + "theoretical_loss": 4.103158320721276, + "tokens_seen": 328269824 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045480441323971914, + "loss": 3.3057, + "theoretical_loss": 4.103063624740133, + "tokens_seen": 328335360 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547943831494484, + "loss": 3.2931, + "theoretical_loss": 4.102968952949684, + "tokens_seen": 328400896 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045478435305917756, + "loss": 3.3025, + "theoretical_loss": 4.102874305338923, + "tokens_seen": 328466432 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045477432296890674, + "loss": 3.1946, + "theoretical_loss": 4.102779681896852, + "tokens_seen": 328531968 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547642928786359, + "loss": 3.2114, + "theoretical_loss": 4.10268508261248, + "tokens_seen": 328597504 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547542627883651, + "loss": 3.2089, + "theoretical_loss": 4.102590507474824, + "tokens_seen": 328663040 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547442326980943, + "loss": 2.9963, + "theoretical_loss": 4.1024959564729055, + "tokens_seen": 328728576 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547342026078235, + "loss": 3.1631, + "theoretical_loss": 4.102401429595758, + "tokens_seen": 328794112 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045472417251755264, + "loss": 3.2726, + "theoretical_loss": 4.102306926832417, + "tokens_seen": 328859648 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004547141424272819, + "loss": 3.2823, + "theoretical_loss": 4.102212448171928, + "tokens_seen": 328925184 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454704112337011, + "loss": 3.1027, + "theoretical_loss": 4.1021179936033425, + "tokens_seen": 328990720 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045469408224674024, + "loss": 3.281, + "theoretical_loss": 4.102023563115721, + "tokens_seen": 329056256 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546840521564694, + "loss": 3.327, + "theoretical_loss": 4.10192915669813, + "tokens_seen": 329121792 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546740220661986, + "loss": 3.2447, + "theoretical_loss": 4.1018347743396415, + "tokens_seen": 329187328 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546639919759278, + "loss": 3.2314, + "theoretical_loss": 4.101740416029338, + "tokens_seen": 329252864 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.027589797973633, + "objective/train/theoretical_loss": 4.101646081756305, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.101646081756305, + "tokens_seen": 329318400 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454653961885657, + "loss": 3.1965, + "theoretical_loss": 4.101646081756305, + "tokens_seen": 329318400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045464393179538615, + "loss": 3.0304, + "theoretical_loss": 4.101551771509641, + "tokens_seen": 329383936 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546339017051154, + "loss": 2.9485, + "theoretical_loss": 4.101457485278444, + "tokens_seen": 329449472 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546238716148445, + "loss": 3.0351, + "theoretical_loss": 4.101363223051826, + "tokens_seen": 329515008 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045461384152457374, + "loss": 3.2807, + "theoretical_loss": 4.101268984818901, + "tokens_seen": 329580544 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004546038114343029, + "loss": 2.9531, + "theoretical_loss": 4.101174770568795, + "tokens_seen": 329646080 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545937813440321, + "loss": 3.2158, + "theoretical_loss": 4.1010805802906365, + "tokens_seen": 329711616 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545837512537613, + "loss": 3.1841, + "theoretical_loss": 4.100986413973564, + "tokens_seen": 329777152 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045457372116349047, + "loss": 3.1752, + "theoretical_loss": 4.100892271606721, + "tokens_seen": 329842688 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045456369107321965, + "loss": 3.1767, + "theoretical_loss": 4.1007981531792606, + "tokens_seen": 329908224 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545536609829489, + "loss": 3.1611, + "theoretical_loss": 4.100704058680341, + "tokens_seen": 329973760 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454543630892678, + "loss": 3.2417, + "theoretical_loss": 4.1006099880991265, + "tokens_seen": 330039296 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045453360080240725, + "loss": 3.1886, + "theoretical_loss": 4.100515941424792, + "tokens_seen": 330104832 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045452357071213643, + "loss": 3.1292, + "theoretical_loss": 4.100421918646517, + "tokens_seen": 330170368 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545135406218656, + "loss": 3.1455, + "theoretical_loss": 4.1003279197534885, + "tokens_seen": 330235904 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545035105315948, + "loss": 3.1236, + "theoretical_loss": 4.100233944734899, + "tokens_seen": 330301440 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045449348044132397, + "loss": 3.1264, + "theoretical_loss": 4.100139993579952, + "tokens_seen": 330366976 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045448345035105315, + "loss": 3.3271, + "theoretical_loss": 4.100046066277853, + "tokens_seen": 330432512 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544734202607824, + "loss": 3.2425, + "theoretical_loss": 4.09995216281782, + "tokens_seen": 330498048 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544633901705115, + "loss": 3.0682, + "theoretical_loss": 4.0998582831890715, + "tokens_seen": 330563584 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045445336008024075, + "loss": 3.044, + "theoretical_loss": 4.0997644273808405, + "tokens_seen": 330629120 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544433299899699, + "loss": 3.127, + "theoretical_loss": 4.09967059538236, + "tokens_seen": 330694656 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544332998996991, + "loss": 3.2439, + "theoretical_loss": 4.099576787182874, + "tokens_seen": 330760192 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544232698094283, + "loss": 3.1941, + "theoretical_loss": 4.099483002771633, + "tokens_seen": 330825728 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004544132397191575, + "loss": 3.0704, + "theoretical_loss": 4.099389242137894, + "tokens_seen": 330891264 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.073639392852783, + "objective/train/theoretical_loss": 4.099295505270921, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.099295505270921, + "tokens_seen": 330956800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045440320962888666, + "loss": 3.1793, + "theoretical_loss": 4.099295505270921, + "tokens_seen": 330956800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045439317953861584, + "loss": 3.064, + "theoretical_loss": 4.099201792159985, + "tokens_seen": 331022336 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454383149448345, + "loss": 3.1013, + "theoretical_loss": 4.099108102794363, + "tokens_seen": 331087872 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045437311935807425, + "loss": 3.2513, + "theoretical_loss": 4.099014437163342, + "tokens_seen": 331153408 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004543630892678034, + "loss": 3.2451, + "theoretical_loss": 4.098920795256213, + "tokens_seen": 331218944 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004543530591775326, + "loss": 3.2996, + "theoretical_loss": 4.098827177062273, + "tokens_seen": 331284480 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004543430290872618, + "loss": 3.3305, + "theoretical_loss": 4.098733582570831, + "tokens_seen": 331350016 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454332998996991, + "loss": 3.1613, + "theoretical_loss": 4.098640011771198, + "tokens_seen": 331415552 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045432296890672016, + "loss": 3.2703, + "theoretical_loss": 4.098546464652693, + "tokens_seen": 331481088 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045431293881644934, + "loss": 3.2185, + "theoretical_loss": 4.098452941204643, + "tokens_seen": 331546624 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004543029087261785, + "loss": 3.2497, + "theoretical_loss": 4.098359441416383, + "tokens_seen": 331612160 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045429287863590776, + "loss": 3.0283, + "theoretical_loss": 4.0982659652772515, + "tokens_seen": 331677696 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004542828485456369, + "loss": 3.2659, + "theoretical_loss": 4.098172512776597, + "tokens_seen": 331743232 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004542728184553661, + "loss": 3.2737, + "theoretical_loss": 4.098079083903773, + "tokens_seen": 331808768 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045426278836509525, + "loss": 3.1034, + "theoretical_loss": 4.097985678648142, + "tokens_seen": 331874304 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004542527582748245, + "loss": 3.1871, + "theoretical_loss": 4.09789229699907, + "tokens_seen": 331939840 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045424272818455366, + "loss": 3.2838, + "theoretical_loss": 4.097798938945933, + "tokens_seen": 332005376 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045423269809428284, + "loss": 3.0953, + "theoretical_loss": 4.097705604478112, + "tokens_seen": 332070912 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454222668004012, + "loss": 3.1509, + "theoretical_loss": 4.097612293584998, + "tokens_seen": 332136448 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004542126379137412, + "loss": 3.291, + "theoretical_loss": 4.0975190062559825, + "tokens_seen": 332201984 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045420260782347044, + "loss": 3.132, + "theoretical_loss": 4.097425742480472, + "tokens_seen": 332267520 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004541925777331996, + "loss": 3.136, + "theoretical_loss": 4.097332502247873, + "tokens_seen": 332333056 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004541825476429288, + "loss": 3.2548, + "theoretical_loss": 4.0972392855476025, + "tokens_seen": 332398592 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454172517552658, + "loss": 3.168, + "theoretical_loss": 4.097146092369084, + "tokens_seen": 332464128 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004541624874623872, + "loss": 3.0876, + "theoretical_loss": 4.097052922701746, + "tokens_seen": 332529664 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.255794048309326, + "objective/train/theoretical_loss": 4.0969597765350265, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.0969597765350265, + "tokens_seen": 332595200 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045415245737211635, + "loss": 3.1154, + "theoretical_loss": 4.0969597765350265, + "tokens_seen": 332595200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004541424272818456, + "loss": 3.1241, + "theoretical_loss": 4.096866653858368, + "tokens_seen": 332660736 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004541323971915747, + "loss": 3.113, + "theoretical_loss": 4.09677355466122, + "tokens_seen": 332726272 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045412236710130395, + "loss": 3.2644, + "theoretical_loss": 4.096680478933042, + "tokens_seen": 332791808 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004541123370110331, + "loss": 3.2421, + "theoretical_loss": 4.0965874266632945, + "tokens_seen": 332857344 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004541023069207623, + "loss": 3.1385, + "theoretical_loss": 4.09649439784145, + "tokens_seen": 332922880 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004540922768304915, + "loss": 3.2888, + "theoretical_loss": 4.096401392456988, + "tokens_seen": 332988416 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045408224674022067, + "loss": 3.1945, + "theoretical_loss": 4.09630841049939, + "tokens_seen": 333053952 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045407221664994985, + "loss": 3.1287, + "theoretical_loss": 4.096215451958146, + "tokens_seen": 333119488 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004540621865596791, + "loss": 3.0674, + "theoretical_loss": 4.096122516822757, + "tokens_seen": 333185024 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004540521564694082, + "loss": 3.2587, + "theoretical_loss": 4.096029605082726, + "tokens_seen": 333250560 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045404212637913745, + "loss": 3.2792, + "theoretical_loss": 4.095936716727564, + "tokens_seen": 333316096 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045403209628886663, + "loss": 3.269, + "theoretical_loss": 4.095843851746791, + "tokens_seen": 333381632 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004540220661985958, + "loss": 3.1944, + "theoretical_loss": 4.095751010129929, + "tokens_seen": 333447168 + }, + { + "epoch": 0.1, + "learning_rate": 0.000454012036108325, + "loss": 3.1355, + "theoretical_loss": 4.095658191866512, + "tokens_seen": 333512704 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045400200601805417, + "loss": 3.0684, + "theoretical_loss": 4.0955653969460775, + "tokens_seen": 333578240 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045399197592778335, + "loss": 3.2824, + "theoretical_loss": 4.095472625358171, + "tokens_seen": 333643776 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004539819458375126, + "loss": 3.3674, + "theoretical_loss": 4.095379877092343, + "tokens_seen": 333709312 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004539719157472417, + "loss": 3.0904, + "theoretical_loss": 4.095287152138154, + "tokens_seen": 333774848 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045396188565697095, + "loss": 3.3028, + "theoretical_loss": 4.0951944504851685, + "tokens_seen": 333840384 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004539518555667001, + "loss": 3.3596, + "theoretical_loss": 4.095101772122959, + "tokens_seen": 333905920 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004539418254764293, + "loss": 3.1435, + "theoretical_loss": 4.095009117041102, + "tokens_seen": 333971456 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004539317953861585, + "loss": 3.1785, + "theoretical_loss": 4.094916485229186, + "tokens_seen": 334036992 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004539217652958877, + "loss": 3.3569, + "theoretical_loss": 4.094823876676802, + "tokens_seen": 334102528 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045391173520561686, + "loss": 3.0185, + "theoretical_loss": 4.094731291373548, + "tokens_seen": 334168064 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.997954845428467, + "objective/train/theoretical_loss": 4.094638729309031, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.094638729309031, + "tokens_seen": 334233600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045390170511534604, + "loss": 3.1893, + "theoretical_loss": 4.094638729309031, + "tokens_seen": 334233600 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004538916750250752, + "loss": 3.0752, + "theoretical_loss": 4.094546190472862, + "tokens_seen": 334299136 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045388164493480445, + "loss": 3.3252, + "theoretical_loss": 4.09445367485466, + "tokens_seen": 334364672 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004538716148445336, + "loss": 3.3451, + "theoretical_loss": 4.09436118244405, + "tokens_seen": 334430208 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004538615847542628, + "loss": 3.2002, + "theoretical_loss": 4.094268713230667, + "tokens_seen": 334495744 + }, + { + "epoch": 0.1, + "learning_rate": 0.000453851554663992, + "loss": 3.1522, + "theoretical_loss": 4.094176267204148, + "tokens_seen": 334561280 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004538415245737212, + "loss": 3.0999, + "theoretical_loss": 4.094083844354137, + "tokens_seen": 334626816 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045383149448345036, + "loss": 3.1105, + "theoretical_loss": 4.093991444670289, + "tokens_seen": 334692352 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045382146439317954, + "loss": 3.122, + "theoretical_loss": 4.093899068142262, + "tokens_seen": 334757888 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004538114343029087, + "loss": 3.258, + "theoretical_loss": 4.093806714759721, + "tokens_seen": 334823424 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045380140421263796, + "loss": 3.3121, + "theoretical_loss": 4.093714384512337, + "tokens_seen": 334888960 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004537913741223671, + "loss": 3.2884, + "theoretical_loss": 4.093622077389791, + "tokens_seen": 334954496 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004537813440320963, + "loss": 2.9823, + "theoretical_loss": 4.093529793381768, + "tokens_seen": 335020032 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045377131394182545, + "loss": 3.0915, + "theoretical_loss": 4.093437532477958, + "tokens_seen": 335085568 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004537612838515547, + "loss": 3.2714, + "theoretical_loss": 4.093345294668063, + "tokens_seen": 335151104 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045375125376128386, + "loss": 3.0272, + "theoretical_loss": 4.0932530799417846, + "tokens_seen": 335216640 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045374122367101304, + "loss": 3.367, + "theoretical_loss": 4.093160888288837, + "tokens_seen": 335282176 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004537311935807422, + "loss": 3.2001, + "theoretical_loss": 4.0930687196989375, + "tokens_seen": 335347712 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004537211634904714, + "loss": 3.1246, + "theoretical_loss": 4.0929765741618125, + "tokens_seen": 335413248 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004537111334002006, + "loss": 3.0592, + "theoretical_loss": 4.092884451667191, + "tokens_seen": 335478784 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004537011033099298, + "loss": 3.2414, + "theoretical_loss": 4.092792352204814, + "tokens_seen": 335544320 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045369107321965895, + "loss": 3.2062, + "theoretical_loss": 4.092700275764424, + "tokens_seen": 335609856 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004536810431293882, + "loss": 3.2414, + "theoretical_loss": 4.092608222335774, + "tokens_seen": 335675392 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045367101303911737, + "loss": 3.1316, + "theoretical_loss": 4.092516191908621, + "tokens_seen": 335740928 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045366098294884655, + "loss": 3.109, + "theoretical_loss": 4.09242418447273, + "tokens_seen": 335806464 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.966989278793335, + "objective/train/theoretical_loss": 4.092332200017871, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.092332200017871, + "tokens_seen": 335872000 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045365095285857573, + "loss": 3.0924, + "theoretical_loss": 4.092332200017871, + "tokens_seen": 335872000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004536409227683049, + "loss": 3.12, + "theoretical_loss": 4.092240238533822, + "tokens_seen": 335937536 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004536308926780341, + "loss": 3.3737, + "theoretical_loss": 4.092148300010367, + "tokens_seen": 336003072 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004536208625877633, + "loss": 3.242, + "theoretical_loss": 4.092056384437297, + "tokens_seen": 336068608 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045361083249749245, + "loss": 3.2626, + "theoretical_loss": 4.091964491804409, + "tokens_seen": 336134144 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004536008024072217, + "loss": 3.2715, + "theoretical_loss": 4.091872622101506, + "tokens_seen": 336199680 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004535907723169508, + "loss": 3.2977, + "theoretical_loss": 4.091780775318399, + "tokens_seen": 336265216 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045358074222668005, + "loss": 3.2198, + "theoretical_loss": 4.091688951444904, + "tokens_seen": 336330752 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045357071213640923, + "loss": 3.0062, + "theoretical_loss": 4.091597150470845, + "tokens_seen": 336396288 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004535606820461384, + "loss": 3.2101, + "theoretical_loss": 4.091505372386051, + "tokens_seen": 336461824 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004535506519558676, + "loss": 3.1865, + "theoretical_loss": 4.091413617180358, + "tokens_seen": 336527360 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045354062186559683, + "loss": 3.2881, + "theoretical_loss": 4.091321884843609, + "tokens_seen": 336592896 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045353059177532596, + "loss": 3.2311, + "theoretical_loss": 4.091230175365653, + "tokens_seen": 336658432 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004535205616850552, + "loss": 3.0658, + "theoretical_loss": 4.0911384887363464, + "tokens_seen": 336723968 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004535105315947843, + "loss": 3.1303, + "theoretical_loss": 4.091046824945551, + "tokens_seen": 336789504 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045350050150451355, + "loss": 3.1469, + "theoretical_loss": 4.0909551839831355, + "tokens_seen": 336855040 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045349047141424274, + "loss": 2.9782, + "theoretical_loss": 4.090863565838974, + "tokens_seen": 336920576 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004534804413239719, + "loss": 3.1557, + "theoretical_loss": 4.090771970502948, + "tokens_seen": 336986112 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004534704112337011, + "loss": 3.1858, + "theoretical_loss": 4.090680397964947, + "tokens_seen": 337051648 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004534603811434303, + "loss": 3.194, + "theoretical_loss": 4.090588848214865, + "tokens_seen": 337117184 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004534503510531595, + "loss": 3.2451, + "theoretical_loss": 4.0904973212426015, + "tokens_seen": 337182720 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004534403209628887, + "loss": 2.951, + "theoretical_loss": 4.090405817038065, + "tokens_seen": 337248256 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004534302908726179, + "loss": 3.0447, + "theoretical_loss": 4.090314335591169, + "tokens_seen": 337313792 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045342026078234706, + "loss": 3.2284, + "theoretical_loss": 4.090222876891834, + "tokens_seen": 337379328 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045341023069207624, + "loss": 3.0517, + "theoretical_loss": 4.090131440929985, + "tokens_seen": 337444864 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.5247080326080322, + "objective/train/theoretical_loss": 4.090040027695556, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.090040027695556, + "tokens_seen": 337510400 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004534002006018054, + "loss": 3.333, + "theoretical_loss": 4.090040027695556, + "tokens_seen": 337510400 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045339017051153465, + "loss": 3.2102, + "theoretical_loss": 4.0899486371784874, + "tokens_seen": 337575936 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004533801404212638, + "loss": 3.1607, + "theoretical_loss": 4.089857269368725, + "tokens_seen": 337641472 + }, + { + "epoch": 0.1, + "learning_rate": 0.000453370110330993, + "loss": 3.3493, + "theoretical_loss": 4.089765924256218, + "tokens_seen": 337707008 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004533600802407222, + "loss": 3.2266, + "theoretical_loss": 4.089674601830929, + "tokens_seen": 337772544 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004533500501504514, + "loss": 3.32, + "theoretical_loss": 4.08958330208282, + "tokens_seen": 337838080 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045334002006018056, + "loss": 3.0731, + "theoretical_loss": 4.089492025001864, + "tokens_seen": 337903616 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045332998996990974, + "loss": 3.2248, + "theoretical_loss": 4.089400770578038, + "tokens_seen": 337969152 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004533199598796389, + "loss": 3.362, + "theoretical_loss": 4.089309538801327, + "tokens_seen": 338034688 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045330992978936816, + "loss": 3.2434, + "theoretical_loss": 4.08921832966172, + "tokens_seen": 338100224 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004532998996990973, + "loss": 3.2551, + "theoretical_loss": 4.0891271431492155, + "tokens_seen": 338165760 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004532898696088265, + "loss": 3.1294, + "theoretical_loss": 4.089035979253816, + "tokens_seen": 338231296 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045327983951855565, + "loss": 3.1613, + "theoretical_loss": 4.08894483796553, + "tokens_seen": 338296832 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004532698094282849, + "loss": 3.1212, + "theoretical_loss": 4.0888537192743755, + "tokens_seen": 338362368 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045325977933801406, + "loss": 3.1365, + "theoretical_loss": 4.088762623170373, + "tokens_seen": 338427904 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045324974924774324, + "loss": 3.0358, + "theoretical_loss": 4.088671549643553, + "tokens_seen": 338493440 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004532397191574724, + "loss": 3.2149, + "theoretical_loss": 4.088580498683948, + "tokens_seen": 338558976 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004532296890672016, + "loss": 3.2108, + "theoretical_loss": 4.088489470281601, + "tokens_seen": 338624512 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004532196589769308, + "loss": 3.0998, + "theoretical_loss": 4.088398464426559, + "tokens_seen": 338690048 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045320962888666, + "loss": 3.252, + "theoretical_loss": 4.088307481108876, + "tokens_seen": 338755584 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045319959879638915, + "loss": 3.2268, + "theoretical_loss": 4.088216520318612, + "tokens_seen": 338821120 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004531895687061184, + "loss": 3.1826, + "theoretical_loss": 4.0881255820458335, + "tokens_seen": 338886656 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045317953861584757, + "loss": 3.1767, + "theoretical_loss": 4.088034666280614, + "tokens_seen": 338952192 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045316950852557675, + "loss": 3.3705, + "theoretical_loss": 4.087943773013032, + "tokens_seen": 339017728 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045315947843530593, + "loss": 3.1949, + "theoretical_loss": 4.087852902233173, + "tokens_seen": 339083264 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1497690677642822, + "objective/train/theoretical_loss": 4.087762053931129, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.087762053931129, + "tokens_seen": 339148800 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004531494483450351, + "loss": 3.3468, + "theoretical_loss": 4.087762053931129, + "tokens_seen": 339148800 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004531394182547643, + "loss": 3.3627, + "theoretical_loss": 4.087671228096997, + "tokens_seen": 339214336 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004531293881644935, + "loss": 3.3478, + "theoretical_loss": 4.087580424720882, + "tokens_seen": 339279872 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045311935807422265, + "loss": 3.001, + "theoretical_loss": 4.087489643792894, + "tokens_seen": 339345408 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004531093279839519, + "loss": 3.3699, + "theoretical_loss": 4.08739888530315, + "tokens_seen": 339410944 + }, + { + "epoch": 0.1, + "learning_rate": 0.000453099297893681, + "loss": 3.1387, + "theoretical_loss": 4.087308149241774, + "tokens_seen": 339476480 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045308926780341025, + "loss": 3.213, + "theoretical_loss": 4.087217435598894, + "tokens_seen": 339542016 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045307923771313943, + "loss": 3.2869, + "theoretical_loss": 4.087126744364646, + "tokens_seen": 339607552 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004530692076228686, + "loss": 3.2006, + "theoretical_loss": 4.087036075529172, + "tokens_seen": 339673088 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004530591775325978, + "loss": 3.1232, + "theoretical_loss": 4.086945429082618, + "tokens_seen": 339738624 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045304914744232703, + "loss": 3.2702, + "theoretical_loss": 4.086854805015141, + "tokens_seen": 339804160 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045303911735205616, + "loss": 3.1844, + "theoretical_loss": 4.086764203316902, + "tokens_seen": 339869696 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004530290872617854, + "loss": 3.298, + "theoretical_loss": 4.086673623978064, + "tokens_seen": 339935232 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004530190571715145, + "loss": 3.1776, + "theoretical_loss": 4.086583066988802, + "tokens_seen": 340000768 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045300902708124375, + "loss": 3.2362, + "theoretical_loss": 4.086492532339296, + "tokens_seen": 340066304 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045299899699097294, + "loss": 3.2234, + "theoretical_loss": 4.0864020200197295, + "tokens_seen": 340131840 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004529889669007021, + "loss": 3.2213, + "theoretical_loss": 4.086311530020296, + "tokens_seen": 340197376 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004529789368104313, + "loss": 3.1171, + "theoretical_loss": 4.086221062331192, + "tokens_seen": 340262912 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004529689067201605, + "loss": 3.2377, + "theoretical_loss": 4.086130616942621, + "tokens_seen": 340328448 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045295887662988966, + "loss": 3.0288, + "theoretical_loss": 4.086040193844794, + "tokens_seen": 340393984 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004529488465396189, + "loss": 3.0465, + "theoretical_loss": 4.085949793027927, + "tokens_seen": 340459520 + }, + { + "epoch": 0.1, + "learning_rate": 0.000452938816449348, + "loss": 3.1687, + "theoretical_loss": 4.0858594144822415, + "tokens_seen": 340525056 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045292878635907726, + "loss": 2.9973, + "theoretical_loss": 4.085769058197968, + "tokens_seen": 340590592 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004529187562688064, + "loss": 3.0832, + "theoretical_loss": 4.085678724165341, + "tokens_seen": 340656128 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004529087261785356, + "loss": 3.2571, + "theoretical_loss": 4.0855884123746, + "tokens_seen": 340721664 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 537861, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3161091804504395, + "objective/train/theoretical_loss": 4.085498122815992, + "objective/train/tokens_used": 341687776, + "theoretical_loss": 4.085498122815992, + "tokens_seen": 340787200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004528986960882648, + "loss": 3.1963, + "theoretical_loss": 4.085498122815992, + "tokens_seen": 340787200 + }, + { + "epoch": 0.1, + "learning_rate": 0.000452888665997994, + "loss": 3.1352, + "theoretical_loss": 4.085407855479772, + "tokens_seen": 340852736 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045287863590772316, + "loss": 3.2215, + "theoretical_loss": 4.085317610356199, + "tokens_seen": 340918272 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004528686058174524, + "loss": 3.2187, + "theoretical_loss": 4.085227387435538, + "tokens_seen": 340983808 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004528585757271815, + "loss": 3.1171, + "theoretical_loss": 4.08513718670806, + "tokens_seen": 341049344 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045284854563691076, + "loss": 3.2336, + "theoretical_loss": 4.085047008164044, + "tokens_seen": 341114880 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004528385155466399, + "loss": 3.1677, + "theoretical_loss": 4.084956851793773, + "tokens_seen": 341180416 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004528284854563691, + "loss": 3.2435, + "theoretical_loss": 4.0848667175875395, + "tokens_seen": 341245952 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004528184553660983, + "loss": 3.0311, + "theoretical_loss": 4.0847766055356365, + "tokens_seen": 341311488 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004528084252758275, + "loss": 3.0619, + "theoretical_loss": 4.084686515628368, + "tokens_seen": 341377024 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045279839518555667, + "loss": 3.1238, + "theoretical_loss": 4.0845964478560415, + "tokens_seen": 341442560 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045278836509528585, + "loss": 3.2286, + "theoretical_loss": 4.084506402208972, + "tokens_seen": 341508096 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045277833500501503, + "loss": 3.0867, + "theoretical_loss": 4.0844163786774805, + "tokens_seen": 341573632 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045276830491474426, + "loss": 3.2731, + "theoretical_loss": 4.084326377251894, + "tokens_seen": 341639168 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004527582748244734, + "loss": 3.0764, + "theoretical_loss": 4.084236397922544, + "tokens_seen": 341704704 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004527482447342026, + "loss": 3.9806, + "theoretical_loss": 4.084142224475771, + "tokens_seen": 341773312 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045273821464393175, + "loss": 3.291, + "theoretical_loss": 4.084052290344537, + "tokens_seen": 341838848 + }, + { + "epoch": 1.0, + "learning_rate": 0.000452728184553661, + "loss": 3.1445, + "theoretical_loss": 4.0839623782801215, + "tokens_seen": 341904384 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045271815446339017, + "loss": 3.2967, + "theoretical_loss": 4.083872488272884, + "tokens_seen": 341969920 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045270812437311935, + "loss": 3.2796, + "theoretical_loss": 4.083782620313186, + "tokens_seen": 342035456 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004526980942828486, + "loss": 3.1324, + "theoretical_loss": 4.083692774391398, + "tokens_seen": 342100992 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045268806419257777, + "loss": 3.2578, + "theoretical_loss": 4.083602950497896, + "tokens_seen": 342166528 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045267803410230695, + "loss": 3.2028, + "theoretical_loss": 4.0835131486230605, + "tokens_seen": 342232064 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045266800401203613, + "loss": 3.1388, + "theoretical_loss": 4.0834233687572805, + "tokens_seen": 342297600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004526579739217653, + "loss": 3.1808, + "theoretical_loss": 4.083333610890947, + "tokens_seen": 342363136 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 572633, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3064844608306885, + "objective/train/theoretical_loss": 4.08326630692256, + "objective/train/tokens_used": 362872288, + "theoretical_loss": 4.08326630692256, + "tokens_seen": 342412288 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004526479438314945, + "loss": 3.2414, + "theoretical_loss": 4.083243875014463, + "tokens_seen": 342428672 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004526379137412237, + "loss": 3.2582, + "theoretical_loss": 4.083154161118232, + "tokens_seen": 342494208 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045262788365095285, + "loss": 3.3248, + "theoretical_loss": 4.083064469192665, + "tokens_seen": 342559744 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004526178535606821, + "loss": 3.3183, + "theoretical_loss": 4.082974799228182, + "tokens_seen": 342625280 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004526078234704112, + "loss": 3.3606, + "theoretical_loss": 4.082885151215207, + "tokens_seen": 342690816 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045259779338014045, + "loss": 3.1971, + "theoretical_loss": 4.082795525144167, + "tokens_seen": 342756352 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045258776328986963, + "loss": 3.1323, + "theoretical_loss": 4.082705921005499, + "tokens_seen": 342821888 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004525777331995988, + "loss": 3.3008, + "theoretical_loss": 4.082616338789646, + "tokens_seen": 342887424 + }, + { + "epoch": 1.0, + "learning_rate": 0.000452567703109328, + "loss": 3.2123, + "theoretical_loss": 4.082526778487054, + "tokens_seen": 342952960 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045255767301905723, + "loss": 3.2882, + "theoretical_loss": 4.082437240088177, + "tokens_seen": 343018496 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045254764292878636, + "loss": 3.1492, + "theoretical_loss": 4.082347723583476, + "tokens_seen": 343084032 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004525376128385156, + "loss": 3.3149, + "theoretical_loss": 4.082258228963416, + "tokens_seen": 343149568 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004525275827482447, + "loss": 3.2678, + "theoretical_loss": 4.082168756218468, + "tokens_seen": 343215104 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045251755265797395, + "loss": 3.2092, + "theoretical_loss": 4.08207930533911, + "tokens_seen": 343280640 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045250752256770314, + "loss": 3.2386, + "theoretical_loss": 4.081989876315825, + "tokens_seen": 343346176 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004524974924774323, + "loss": 3.1725, + "theoretical_loss": 4.0819004691391045, + "tokens_seen": 343411712 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004524874623871615, + "loss": 3.3375, + "theoretical_loss": 4.081811083799442, + "tokens_seen": 343477248 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004524774322968907, + "loss": 3.199, + "theoretical_loss": 4.081721720287339, + "tokens_seen": 343542784 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045246740220661986, + "loss": 3.0421, + "theoretical_loss": 4.081632378593305, + "tokens_seen": 343608320 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004524573721163491, + "loss": 3.1324, + "theoretical_loss": 4.081543058707851, + "tokens_seen": 343673856 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004524473420260782, + "loss": 3.3068, + "theoretical_loss": 4.081453760621496, + "tokens_seen": 343739392 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045243731193580746, + "loss": 3.2572, + "theoretical_loss": 4.081364484324768, + "tokens_seen": 343804928 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004524272818455366, + "loss": 3.0494, + "theoretical_loss": 4.081275229808195, + "tokens_seen": 343870464 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004524172517552658, + "loss": 3.077, + "theoretical_loss": 4.081185997062316, + "tokens_seen": 343936000 + }, + { + "epoch": 1.0, + "learning_rate": 0.000452407221664995, + "loss": 3.3085, + "theoretical_loss": 4.081096786077674, + "tokens_seen": 344001536 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 574101, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.306347608566284, + "objective/train/theoretical_loss": 4.0810298921143175, + "objective/train/tokens_used": 364510688, + "theoretical_loss": 4.0810298921143175, + "tokens_seen": 344050688 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004523971915747242, + "loss": 3.3008, + "theoretical_loss": 4.081007596844816, + "tokens_seen": 344067072 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045238716148445336, + "loss": 3.188, + "theoretical_loss": 4.080918429354298, + "tokens_seen": 344132608 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004523771313941826, + "loss": 3.4139, + "theoretical_loss": 4.080829283596681, + "tokens_seen": 344198144 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004523671013039117, + "loss": 3.0986, + "theoretical_loss": 4.080740159562531, + "tokens_seen": 344263680 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045235707121364096, + "loss": 3.1828, + "theoretical_loss": 4.08065105724242, + "tokens_seen": 344329216 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004523470411233701, + "loss": 3.4081, + "theoretical_loss": 4.080561976626927, + "tokens_seen": 344394752 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004523370110330993, + "loss": 3.2177, + "theoretical_loss": 4.080472917706636, + "tokens_seen": 344460288 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004523269809428285, + "loss": 3.2651, + "theoretical_loss": 4.080383880472137, + "tokens_seen": 344525824 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004523169508525577, + "loss": 3.1222, + "theoretical_loss": 4.080294864914026, + "tokens_seen": 344591360 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045230692076228687, + "loss": 3.214, + "theoretical_loss": 4.080205871022905, + "tokens_seen": 344656896 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045229689067201605, + "loss": 3.3717, + "theoretical_loss": 4.080116898789382, + "tokens_seen": 344722432 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045228686058174523, + "loss": 3.1445, + "theoretical_loss": 4.080027948204069, + "tokens_seen": 344787968 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045227683049147446, + "loss": 3.2551, + "theoretical_loss": 4.079939019257587, + "tokens_seen": 344853504 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004522668004012036, + "loss": 3.134, + "theoretical_loss": 4.07985011194056, + "tokens_seen": 344919040 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004522567703109328, + "loss": 3.2116, + "theoretical_loss": 4.079761226243621, + "tokens_seen": 344984576 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045224674022066195, + "loss": 2.955, + "theoretical_loss": 4.079672362157404, + "tokens_seen": 345050112 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004522367101303912, + "loss": 3.2684, + "theoretical_loss": 4.079583519672554, + "tokens_seen": 345115648 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045222668004012037, + "loss": 3.1194, + "theoretical_loss": 4.079494698779719, + "tokens_seen": 345181184 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045221664994984955, + "loss": 3.3988, + "theoretical_loss": 4.079405899469553, + "tokens_seen": 345246720 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045220661985957873, + "loss": 3.2104, + "theoretical_loss": 4.079317121732716, + "tokens_seen": 345312256 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045219658976930797, + "loss": 3.2978, + "theoretical_loss": 4.0792283655598744, + "tokens_seen": 345377792 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004521865596790371, + "loss": 3.2044, + "theoretical_loss": 4.079139630941701, + "tokens_seen": 345443328 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045217652958876633, + "loss": 3.1591, + "theoretical_loss": 4.079050917868872, + "tokens_seen": 345508864 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045216649949849546, + "loss": 3.2841, + "theoretical_loss": 4.078962226332071, + "tokens_seen": 345574400 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004521564694082247, + "loss": 3.1327, + "theoretical_loss": 4.078873556321988, + "tokens_seen": 345639936 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 577024, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.098017454147339, + "objective/train/theoretical_loss": 4.078807067935736, + "objective/train/tokens_used": 366149088, + "theoretical_loss": 4.078807067935736, + "tokens_seen": 345689088 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045214643931795387, + "loss": 3.089, + "theoretical_loss": 4.078784907829317, + "tokens_seen": 345705472 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045213640922768305, + "loss": 3.2762, + "theoretical_loss": 4.07869628084476, + "tokens_seen": 345771008 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045212637913741223, + "loss": 3.2983, + "theoretical_loss": 4.078607675359023, + "tokens_seen": 345836544 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004521163490471414, + "loss": 3.0904, + "theoretical_loss": 4.078519091362818, + "tokens_seen": 345902080 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004521063189568706, + "loss": 3.0779, + "theoretical_loss": 4.078430528846862, + "tokens_seen": 345967616 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045209628886659983, + "loss": 3.0312, + "theoretical_loss": 4.078341987801882, + "tokens_seen": 346033152 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045208625877632896, + "loss": 3.1953, + "theoretical_loss": 4.078253468218605, + "tokens_seen": 346098688 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004520762286860582, + "loss": 3.3981, + "theoretical_loss": 4.078164970087768, + "tokens_seen": 346164224 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004520661985957873, + "loss": 3.1148, + "theoretical_loss": 4.078076493400111, + "tokens_seen": 346229760 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045205616850551656, + "loss": 3.312, + "theoretical_loss": 4.0779880381463816, + "tokens_seen": 346295296 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045204613841524574, + "loss": 3.2997, + "theoretical_loss": 4.077899604317332, + "tokens_seen": 346360832 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004520361083249749, + "loss": 3.2301, + "theoretical_loss": 4.077811191903721, + "tokens_seen": 346426368 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004520260782347041, + "loss": 3.0401, + "theoretical_loss": 4.0777228008963124, + "tokens_seen": 346491904 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045201604814443334, + "loss": 3.1987, + "theoretical_loss": 4.077634431285876, + "tokens_seen": 346557440 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045200601805416246, + "loss": 3.0769, + "theoretical_loss": 4.077546083063188, + "tokens_seen": 346622976 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004519959879638917, + "loss": 3.2191, + "theoretical_loss": 4.077457756219029, + "tokens_seen": 346688512 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004519859578736208, + "loss": 3.3277, + "theoretical_loss": 4.077369450744186, + "tokens_seen": 346754048 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045197592778335006, + "loss": 3.0852, + "theoretical_loss": 4.077281166629453, + "tokens_seen": 346819584 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045196589769307924, + "loss": 3.2108, + "theoretical_loss": 4.077192903865626, + "tokens_seen": 346885120 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004519558676028084, + "loss": 3.3431, + "theoretical_loss": 4.077104662443512, + "tokens_seen": 346950656 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045194583751253766, + "loss": 3.1752, + "theoretical_loss": 4.0770164423539175, + "tokens_seen": 347016192 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004519358074222668, + "loss": 3.2471, + "theoretical_loss": 4.076928243587662, + "tokens_seen": 347081728 + }, + { + "epoch": 1.0, + "learning_rate": 0.000451925777331996, + "loss": 3.243, + "theoretical_loss": 4.0768400661355635, + "tokens_seen": 347147264 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004519157472417252, + "loss": 3.1235, + "theoretical_loss": 4.07675190998845, + "tokens_seen": 347212800 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004519057171514544, + "loss": 3.2284, + "theoretical_loss": 4.0766637751371535, + "tokens_seen": 347278336 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 580707, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.894477367401123, + "objective/train/theoretical_loss": 4.07659768796855, + "objective/train/tokens_used": 367787488, + "theoretical_loss": 4.07659768796855, + "tokens_seen": 347327488 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045189568706118356, + "loss": 3.2538, + "theoretical_loss": 4.076575661572513, + "tokens_seen": 347343872 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004518856569709128, + "loss": 3.1059, + "theoretical_loss": 4.076487569285373, + "tokens_seen": 347409408 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004518756268806419, + "loss": 3.2101, + "theoretical_loss": 4.076399498266582, + "tokens_seen": 347474944 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045186559679037116, + "loss": 3.1852, + "theoretical_loss": 4.076311448506995, + "tokens_seen": 347540480 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004518555667001003, + "loss": 3.1323, + "theoretical_loss": 4.076223419997474, + "tokens_seen": 347606016 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004518455366098295, + "loss": 3.1391, + "theoretical_loss": 4.076135412728885, + "tokens_seen": 347671552 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004518355065195587, + "loss": 3.2539, + "theoretical_loss": 4.0760474266920985, + "tokens_seen": 347737088 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004518254764292879, + "loss": 3.2101, + "theoretical_loss": 4.0759594618779955, + "tokens_seen": 347802624 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045181544633901707, + "loss": 3.2618, + "theoretical_loss": 4.075871518277458, + "tokens_seen": 347868160 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045180541624874625, + "loss": 3.3358, + "theoretical_loss": 4.075783595881374, + "tokens_seen": 347933696 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045179538615847543, + "loss": 3.2259, + "theoretical_loss": 4.07569569468064, + "tokens_seen": 347999232 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045178535606820466, + "loss": 3.1088, + "theoretical_loss": 4.075607814666155, + "tokens_seen": 348064768 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004517753259779338, + "loss": 3.1283, + "theoretical_loss": 4.075519955828825, + "tokens_seen": 348130304 + }, + { + "epoch": 1.0, + "learning_rate": 0.000451765295887663, + "loss": 3.2042, + "theoretical_loss": 4.075432118159563, + "tokens_seen": 348195840 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045175526579739215, + "loss": 3.2357, + "theoretical_loss": 4.075344301649285, + "tokens_seen": 348261376 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004517452357071214, + "loss": 3.2162, + "theoretical_loss": 4.075256506288914, + "tokens_seen": 348326912 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045173520561685057, + "loss": 3.2484, + "theoretical_loss": 4.075168732069379, + "tokens_seen": 348392448 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045172517552657975, + "loss": 3.2664, + "theoretical_loss": 4.075080978981614, + "tokens_seen": 348457984 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045171514543630893, + "loss": 3.3059, + "theoretical_loss": 4.0749932470165575, + "tokens_seen": 348523520 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045170511534603817, + "loss": 3.1508, + "theoretical_loss": 4.074905536165154, + "tokens_seen": 348589056 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004516950852557673, + "loss": 3.2376, + "theoretical_loss": 4.074817846418357, + "tokens_seen": 348654592 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045168505516549653, + "loss": 3.1055, + "theoretical_loss": 4.074730177767121, + "tokens_seen": 348720128 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045167502507522566, + "loss": 3.1319, + "theoretical_loss": 4.074642530202409, + "tokens_seen": 348785664 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004516649949849549, + "loss": 3.1451, + "theoretical_loss": 4.074554903715186, + "tokens_seen": 348851200 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045165496489468407, + "loss": 3.0726, + "theoretical_loss": 4.074467298296429, + "tokens_seen": 348916736 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 582101, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.327352285385132, + "objective/train/theoretical_loss": 4.074401608053112, + "objective/train/tokens_used": 369425888, + "theoretical_loss": 4.074401608053112, + "tokens_seen": 348965888 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045164493480441325, + "loss": 3.1956, + "theoretical_loss": 4.0743797139371125, + "tokens_seen": 348982272 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045163490471414244, + "loss": 3.278, + "theoretical_loss": 4.0742921506282235, + "tokens_seen": 349047808 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004516248746238716, + "loss": 3.2567, + "theoretical_loss": 4.07420460836075, + "tokens_seen": 349113344 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004516148445336008, + "loss": 3.2037, + "theoretical_loss": 4.074117087125689, + "tokens_seen": 349178880 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045160481444333003, + "loss": 2.8811, + "theoretical_loss": 4.07402958691404, + "tokens_seen": 349244416 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045159478435305916, + "loss": 3.3009, + "theoretical_loss": 4.073942107716809, + "tokens_seen": 349309952 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004515847542627884, + "loss": 3.1826, + "theoretical_loss": 4.07385464952501, + "tokens_seen": 349375488 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004515747241725175, + "loss": 3.2202, + "theoretical_loss": 4.073767212329658, + "tokens_seen": 349441024 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045156469408224676, + "loss": 3.1651, + "theoretical_loss": 4.073679796121777, + "tokens_seen": 349506560 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045155466399197594, + "loss": 3.2191, + "theoretical_loss": 4.073592400892395, + "tokens_seen": 349572096 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004515446339017051, + "loss": 3.1769, + "theoretical_loss": 4.073505026632548, + "tokens_seen": 349637632 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004515346038114343, + "loss": 3.2118, + "theoretical_loss": 4.073417673333272, + "tokens_seen": 349703168 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045152457372116354, + "loss": 3.2707, + "theoretical_loss": 4.073330340985614, + "tokens_seen": 349768704 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045151454363089266, + "loss": 3.1634, + "theoretical_loss": 4.073243029580625, + "tokens_seen": 349834240 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004515045135406219, + "loss": 3.1584, + "theoretical_loss": 4.073155739109359, + "tokens_seen": 349899776 + }, + { + "epoch": 1.0, + "learning_rate": 0.000451494483450351, + "loss": 3.2707, + "theoretical_loss": 4.07306846956288, + "tokens_seen": 349965312 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045148445336008026, + "loss": 3.2289, + "theoretical_loss": 4.072981220932253, + "tokens_seen": 350030848 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045147442326980944, + "loss": 3.2645, + "theoretical_loss": 4.0728939932085515, + "tokens_seen": 350096384 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004514643931795386, + "loss": 3.2133, + "theoretical_loss": 4.072806786382853, + "tokens_seen": 350161920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004514543630892678, + "loss": 3.2158, + "theoretical_loss": 4.072719600446241, + "tokens_seen": 350227456 + }, + { + "epoch": 1.0, + "learning_rate": 0.000451444332998997, + "loss": 3.1266, + "theoretical_loss": 4.072632435389805, + "tokens_seen": 350292992 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045143430290872617, + "loss": 3.3611, + "theoretical_loss": 4.072545291204638, + "tokens_seen": 350358528 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004514242728184554, + "loss": 3.165, + "theoretical_loss": 4.072458167881841, + "tokens_seen": 350424064 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045141424272818453, + "loss": 3.224, + "theoretical_loss": 4.0723710654125185, + "tokens_seen": 350489600 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045140421263791376, + "loss": 3.2151, + "theoretical_loss": 4.072283983787782, + "tokens_seen": 350555136 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 585026, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.312256097793579, + "objective/train/theoretical_loss": 4.0722186862431435, + "objective/train/tokens_used": 371064288, + "theoretical_loss": 4.0722186862431435, + "tokens_seen": 350604288 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045139418254764294, + "loss": 3.1982, + "theoretical_loss": 4.072196922998746, + "tokens_seen": 350620672 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004513841524573721, + "loss": 3.0946, + "theoretical_loss": 4.072109883036535, + "tokens_seen": 350686208 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004513741223671013, + "loss": 3.0547, + "theoretical_loss": 4.072022863892274, + "tokens_seen": 350751744 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004513640922768305, + "loss": 3.2009, + "theoretical_loss": 4.071935865557095, + "tokens_seen": 350817280 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045135406218655967, + "loss": 3.2359, + "theoretical_loss": 4.071848888022138, + "tokens_seen": 350882816 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004513440320962889, + "loss": 3.3058, + "theoretical_loss": 4.0717619312785445, + "tokens_seen": 350948352 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045133400200601803, + "loss": 3.1876, + "theoretical_loss": 4.071674995317464, + "tokens_seen": 351013888 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045132397191574727, + "loss": 3.2414, + "theoretical_loss": 4.071588080130051, + "tokens_seen": 351079424 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004513139418254764, + "loss": 3.0741, + "theoretical_loss": 4.071501185707465, + "tokens_seen": 351144960 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045130391173520563, + "loss": 3.0883, + "theoretical_loss": 4.071414312040871, + "tokens_seen": 351210496 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004512938816449348, + "loss": 3.2598, + "theoretical_loss": 4.07132745912144, + "tokens_seen": 351276032 + }, + { + "epoch": 1.0, + "learning_rate": 0.000451283851554664, + "loss": 3.3161, + "theoretical_loss": 4.071240626940346, + "tokens_seen": 351341568 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045127382146439317, + "loss": 3.1431, + "theoretical_loss": 4.071153815488772, + "tokens_seen": 351407104 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045126379137412235, + "loss": 3.1095, + "theoretical_loss": 4.0710670247579035, + "tokens_seen": 351472640 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045125376128385153, + "loss": 3.1876, + "theoretical_loss": 4.070980254738934, + "tokens_seen": 351538176 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045124373119358077, + "loss": 3.3457, + "theoretical_loss": 4.07089350542306, + "tokens_seen": 351603712 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004512337011033099, + "loss": 3.3096, + "theoretical_loss": 4.070806776801484, + "tokens_seen": 351669248 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045122367101303913, + "loss": 3.1891, + "theoretical_loss": 4.070720068865414, + "tokens_seen": 351734784 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004512136409227683, + "loss": 3.3205, + "theoretical_loss": 4.070633381606065, + "tokens_seen": 351800320 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004512036108324975, + "loss": 3.2134, + "theoretical_loss": 4.070546715014654, + "tokens_seen": 351865856 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045119358074222673, + "loss": 3.2348, + "theoretical_loss": 4.070460069082406, + "tokens_seen": 351931392 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045118355065195586, + "loss": 3.0626, + "theoretical_loss": 4.070373443800552, + "tokens_seen": 351996928 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004511735205616851, + "loss": 3.234, + "theoretical_loss": 4.070286839160325, + "tokens_seen": 352062464 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004511634904714143, + "loss": 3.2684, + "theoretical_loss": 4.070200255152967, + "tokens_seen": 352128000 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045115346038114345, + "loss": 3.3108, + "theoretical_loss": 4.070113691769722, + "tokens_seen": 352193536 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 588030, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7068357467651367, + "objective/train/theoretical_loss": 4.070048782761599, + "objective/train/tokens_used": 372702688, + "theoretical_loss": 4.070048782761599, + "tokens_seen": 352242688 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045114343029087264, + "loss": 3.1326, + "theoretical_loss": 4.070027149001842, + "tokens_seen": 352259072 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004511334002006018, + "loss": 3.2432, + "theoretical_loss": 4.069940626840584, + "tokens_seen": 352324608 + }, + { + "epoch": 1.0, + "learning_rate": 0.000451123370110331, + "loss": 3.2969, + "theoretical_loss": 4.069854125277208, + "tokens_seen": 352390144 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045111334002006023, + "loss": 3.1461, + "theoretical_loss": 4.069767644302982, + "tokens_seen": 352455680 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045110330992978936, + "loss": 3.1506, + "theoretical_loss": 4.069681183909179, + "tokens_seen": 352521216 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004510932798395186, + "loss": 3.4042, + "theoretical_loss": 4.069594744087076, + "tokens_seen": 352586752 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004510832497492477, + "loss": 3.1749, + "theoretical_loss": 4.0695083248279555, + "tokens_seen": 352652288 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045107321965897696, + "loss": 3.0914, + "theoretical_loss": 4.069421926123106, + "tokens_seen": 352717824 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045106318956870614, + "loss": 3.1858, + "theoretical_loss": 4.0693355479638225, + "tokens_seen": 352783360 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004510531594784353, + "loss": 3.2901, + "theoretical_loss": 4.069249190341402, + "tokens_seen": 352848896 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004510431293881645, + "loss": 3.3559, + "theoretical_loss": 4.06916285324715, + "tokens_seen": 352914432 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045103309929789374, + "loss": 3.1892, + "theoretical_loss": 4.069076536672376, + "tokens_seen": 352979968 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045102306920762286, + "loss": 3.1782, + "theoretical_loss": 4.068990240608394, + "tokens_seen": 353045504 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004510130391173521, + "loss": 3.1809, + "theoretical_loss": 4.068903965046524, + "tokens_seen": 353111040 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004510030090270812, + "loss": 3.2242, + "theoretical_loss": 4.0688177099780924, + "tokens_seen": 353176576 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045099297893681046, + "loss": 3.1829, + "theoretical_loss": 4.068731475394429, + "tokens_seen": 353242112 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045098294884653964, + "loss": 3.227, + "theoretical_loss": 4.06864526128687, + "tokens_seen": 353307648 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004509729187562688, + "loss": 3.1809, + "theoretical_loss": 4.068559067646758, + "tokens_seen": 353373184 + }, + { + "epoch": 1.0, + "learning_rate": 0.000450962888665998, + "loss": 3.2348, + "theoretical_loss": 4.068472894465437, + "tokens_seen": 353438720 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004509528585757272, + "loss": 3.1091, + "theoretical_loss": 4.06838674173426, + "tokens_seen": 353504256 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045094282848545637, + "loss": 3.1172, + "theoretical_loss": 4.0683006094445835, + "tokens_seen": 353569792 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004509327983951856, + "loss": 3.2622, + "theoretical_loss": 4.068214497587771, + "tokens_seen": 353635328 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045092276830491473, + "loss": 3.1955, + "theoretical_loss": 4.068128406155188, + "tokens_seen": 353700864 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045091273821464396, + "loss": 3.0562, + "theoretical_loss": 4.068042335138209, + "tokens_seen": 353766400 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045090270812437314, + "loss": 3.1372, + "theoretical_loss": 4.067956284528211, + "tokens_seen": 353831936 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 591016, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.173988103866577, + "objective/train/theoretical_loss": 4.067891759957611, + "objective/train/tokens_used": 374341088, + "theoretical_loss": 4.067891759957611, + "tokens_seen": 353881088 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004508926780341023, + "loss": 3.2636, + "theoretical_loss": 4.067870254316579, + "tokens_seen": 353897472 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004508826479438315, + "loss": 3.2112, + "theoretical_loss": 4.067784244494698, + "tokens_seen": 353963008 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004508726178535607, + "loss": 3.2224, + "theoretical_loss": 4.067698255053965, + "tokens_seen": 354028544 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045086258776328987, + "loss": 3.3535, + "theoretical_loss": 4.067612285985777, + "tokens_seen": 354094080 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004508525576730191, + "loss": 3.1912, + "theoretical_loss": 4.067526337281539, + "tokens_seen": 354159616 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045084252758274823, + "loss": 3.159, + "theoretical_loss": 4.06744040893266, + "tokens_seen": 354225152 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045083249749247747, + "loss": 3.1745, + "theoretical_loss": 4.067354500930554, + "tokens_seen": 354290688 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004508224674022066, + "loss": 3.2419, + "theoretical_loss": 4.067268613266641, + "tokens_seen": 354356224 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045081243731193583, + "loss": 3.3106, + "theoretical_loss": 4.067182745932348, + "tokens_seen": 354421760 + }, + { + "epoch": 1.0, + "learning_rate": 0.000450802407221665, + "loss": 3.1903, + "theoretical_loss": 4.067096898919102, + "tokens_seen": 354487296 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004507923771313942, + "loss": 3.327, + "theoretical_loss": 4.0670110722183415, + "tokens_seen": 354552832 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045078234704112337, + "loss": 3.2427, + "theoretical_loss": 4.066925265821504, + "tokens_seen": 354618368 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045077231695085255, + "loss": 3.1788, + "theoretical_loss": 4.066839479720038, + "tokens_seen": 354683904 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045076228686058173, + "loss": 3.046, + "theoretical_loss": 4.066753713905392, + "tokens_seen": 354749440 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045075225677031097, + "loss": 3.0597, + "theoretical_loss": 4.066667968369023, + "tokens_seen": 354814976 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004507422266800401, + "loss": 3.3341, + "theoretical_loss": 4.066582243102394, + "tokens_seen": 354880512 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045073219658976933, + "loss": 3.1262, + "theoretical_loss": 4.066496538096969, + "tokens_seen": 354946048 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004507221664994985, + "loss": 3.1987, + "theoretical_loss": 4.06641085334422, + "tokens_seen": 355011584 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004507121364092277, + "loss": 3.2917, + "theoretical_loss": 4.066325188835625, + "tokens_seen": 355077120 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004507021063189569, + "loss": 3.2667, + "theoretical_loss": 4.066239544562666, + "tokens_seen": 355142656 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045069207622868606, + "loss": 3.2767, + "theoretical_loss": 4.066153920516828, + "tokens_seen": 355208192 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045068204613841524, + "loss": 3.0723, + "theoretical_loss": 4.066068316689606, + "tokens_seen": 355273728 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004506720160481445, + "loss": 3.2575, + "theoretical_loss": 4.065982733072495, + "tokens_seen": 355339264 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004506619859578736, + "loss": 3.2903, + "theoretical_loss": 4.065897169656999, + "tokens_seen": 355404800 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045065195586760284, + "loss": 3.2077, + "theoretical_loss": 4.065811626434625, + "tokens_seen": 355470336 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 592871, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.252450466156006, + "objective/train/theoretical_loss": 4.065747482264476, + "objective/train/tokens_used": 375979488, + "theoretical_loss": 4.065747482264476, + "tokens_seen": 355519488 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045064192577733196, + "loss": 3.2067, + "theoretical_loss": 4.065726103396887, + "tokens_seen": 355535872 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004506318956870612, + "loss": 3.2239, + "theoretical_loss": 4.065640600535302, + "tokens_seen": 355601408 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004506218655967904, + "loss": 3.1752, + "theoretical_loss": 4.0655551178413925, + "tokens_seen": 355666944 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045061183550651956, + "loss": 3.2137, + "theoretical_loss": 4.065469655306687, + "tokens_seen": 355732480 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045060180541624874, + "loss": 3.2895, + "theoretical_loss": 4.065384212922719, + "tokens_seen": 355798016 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004505917753259779, + "loss": 3.2247, + "theoretical_loss": 4.0652987906810285, + "tokens_seen": 355863552 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004505817452357071, + "loss": 3.1652, + "theoretical_loss": 4.065213388573157, + "tokens_seen": 355929088 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045057171514543634, + "loss": 3.1803, + "theoretical_loss": 4.065128006590653, + "tokens_seen": 355994624 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045056168505516547, + "loss": 2.98, + "theoretical_loss": 4.065042644725072, + "tokens_seen": 356060160 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004505516549648947, + "loss": 3.071, + "theoretical_loss": 4.064957302967971, + "tokens_seen": 356125696 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004505416248746239, + "loss": 3.3663, + "theoretical_loss": 4.064871981310915, + "tokens_seen": 356191232 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045053159478435306, + "loss": 3.113, + "theoretical_loss": 4.064786679745474, + "tokens_seen": 356256768 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045052156469408224, + "loss": 3.1587, + "theoretical_loss": 4.064701398263219, + "tokens_seen": 356322304 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004505115346038114, + "loss": 3.3652, + "theoretical_loss": 4.064616136855733, + "tokens_seen": 356387840 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004505015045135406, + "loss": 3.191, + "theoretical_loss": 4.064530895514597, + "tokens_seen": 356453376 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045049147442326984, + "loss": 3.1612, + "theoretical_loss": 4.064445674231402, + "tokens_seen": 356518912 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045048144433299897, + "loss": 3.3313, + "theoretical_loss": 4.064360472997743, + "tokens_seen": 356584448 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004504714142427282, + "loss": 3.0972, + "theoretical_loss": 4.0642752918052185, + "tokens_seen": 356649984 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045046138415245733, + "loss": 3.3341, + "theoretical_loss": 4.064190130645432, + "tokens_seen": 356715520 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045045135406218657, + "loss": 3.141, + "theoretical_loss": 4.064104989509996, + "tokens_seen": 356781056 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004504413239719158, + "loss": 3.3333, + "theoretical_loss": 4.064019868390522, + "tokens_seen": 356846592 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045043129388164493, + "loss": 3.3108, + "theoretical_loss": 4.063934767278632, + "tokens_seen": 356912128 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045042126379137416, + "loss": 3.2488, + "theoretical_loss": 4.06384968616595, + "tokens_seen": 356977664 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045041123370110334, + "loss": 3.446, + "theoretical_loss": 4.063764625044106, + "tokens_seen": 357043200 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004504012036108325, + "loss": 3.2119, + "theoretical_loss": 4.063679583904735, + "tokens_seen": 357108736 + }, + { + "epoch": 1.0, + "objective/train/docs_used": 595649, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.348710060119629, + "objective/train/theoretical_loss": 4.063615816158674, + "objective/train/tokens_used": 377617888, + "theoretical_loss": 4.063615816158674, + "tokens_seen": 357157888 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004503911735205617, + "loss": 3.1224, + "theoretical_loss": 4.063594562739476, + "tokens_seen": 357174272 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004503811434302909, + "loss": 3.076, + "theoretical_loss": 4.063509561539974, + "tokens_seen": 357239808 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045037111334002007, + "loss": 3.1912, + "theoretical_loss": 4.06342458029788, + "tokens_seen": 357305344 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004503610832497493, + "loss": 3.1781, + "theoretical_loss": 4.063339619004848, + "tokens_seen": 357370880 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045035105315947843, + "loss": 3.2177, + "theoretical_loss": 4.063254677652539, + "tokens_seen": 357436416 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045034102306920767, + "loss": 3.177, + "theoretical_loss": 4.063169756232616, + "tokens_seen": 357501952 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004503309929789368, + "loss": 3.1839, + "theoretical_loss": 4.0630848547367515, + "tokens_seen": 357567488 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045032096288866603, + "loss": 3.3351, + "theoretical_loss": 4.062999973156619, + "tokens_seen": 357633024 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004503109327983952, + "loss": 3.1785, + "theoretical_loss": 4.062915111483899, + "tokens_seen": 357698560 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004503009027081244, + "loss": 3.0904, + "theoretical_loss": 4.062830269710275, + "tokens_seen": 357764096 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045029087261785357, + "loss": 3.2194, + "theoretical_loss": 4.0627454478274405, + "tokens_seen": 357829632 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045028084252758275, + "loss": 3.2744, + "theoretical_loss": 4.062660645827087, + "tokens_seen": 357895168 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045027081243731193, + "loss": 3.2529, + "theoretical_loss": 4.062575863700916, + "tokens_seen": 357960704 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045026078234704117, + "loss": 3.2062, + "theoretical_loss": 4.062491101440633, + "tokens_seen": 358026240 + }, + { + "epoch": 1.0, + "learning_rate": 0.0004502507522567703, + "loss": 3.2737, + "theoretical_loss": 4.062406359037947, + "tokens_seen": 358091776 + }, + { + "epoch": 1.0, + "learning_rate": 0.00045024072216649953, + "loss": 3.1081, + "theoretical_loss": 4.0623216364845725, + "tokens_seen": 358157312 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004502306920762287, + "loss": 3.178, + "theoretical_loss": 4.0622369337722315, + "tokens_seen": 358222848 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004502206619859579, + "loss": 3.2489, + "theoretical_loss": 4.062152250892646, + "tokens_seen": 358288384 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004502106318956871, + "loss": 3.081, + "theoretical_loss": 4.062067587837548, + "tokens_seen": 358353920 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045020060180541626, + "loss": 3.2381, + "theoretical_loss": 4.061982944598672, + "tokens_seen": 358419456 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045019057171514544, + "loss": 3.0833, + "theoretical_loss": 4.061898321167757, + "tokens_seen": 358484992 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004501805416248747, + "loss": 3.1337, + "theoretical_loss": 4.061813717536548, + "tokens_seen": 358550528 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004501705115346038, + "loss": 3.2097, + "theoretical_loss": 4.061729133696795, + "tokens_seen": 358616064 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045016048144433304, + "loss": 3.2319, + "theoretical_loss": 4.061644569640252, + "tokens_seen": 358681600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045015045135406216, + "loss": 3.1189, + "theoretical_loss": 4.061560025358679, + "tokens_seen": 358747136 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 598409, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.941607713699341, + "objective/train/theoretical_loss": 4.061496630119868, + "objective/train/tokens_used": 379256288, + "theoretical_loss": 4.061496630119868, + "tokens_seen": 358796288 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004501404212637914, + "loss": 3.0646, + "theoretical_loss": 4.06147550084384, + "tokens_seen": 358812672 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004501303911735206, + "loss": 3.3479, + "theoretical_loss": 4.061390996087504, + "tokens_seen": 358878208 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045012036108324976, + "loss": 3.2583, + "theoretical_loss": 4.0613065110814475, + "tokens_seen": 358943744 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045011033099297894, + "loss": 3.0394, + "theoretical_loss": 4.0612220458174475, + "tokens_seen": 359009280 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004501003009027081, + "loss": 3.1247, + "theoretical_loss": 4.06113760028729, + "tokens_seen": 359074816 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004500902708124373, + "loss": 3.288, + "theoretical_loss": 4.061053174482762, + "tokens_seen": 359140352 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045008024072216654, + "loss": 3.2188, + "theoretical_loss": 4.060968768395659, + "tokens_seen": 359205888 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045007021063189567, + "loss": 3.1713, + "theoretical_loss": 4.060884382017779, + "tokens_seen": 359271424 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004500601805416249, + "loss": 3.2265, + "theoretical_loss": 4.060800015340927, + "tokens_seen": 359336960 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004500501504513541, + "loss": 3.3442, + "theoretical_loss": 4.060715668356911, + "tokens_seen": 359402496 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045004012036108326, + "loss": 3.2555, + "theoretical_loss": 4.060631341057545, + "tokens_seen": 359468032 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045003009027081244, + "loss": 3.2806, + "theoretical_loss": 4.060547033434647, + "tokens_seen": 359533568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004500200601805416, + "loss": 3.2568, + "theoretical_loss": 4.060462745480041, + "tokens_seen": 359599104 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004500100300902708, + "loss": 3.0718, + "theoretical_loss": 4.060378477185554, + "tokens_seen": 359664640 + }, + { + "epoch": 1.01, + "learning_rate": 0.00045000000000000004, + "loss": 3.2618, + "theoretical_loss": 4.060294228543021, + "tokens_seen": 359730176 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044998996990972917, + "loss": 3.218, + "theoretical_loss": 4.060209999544279, + "tokens_seen": 359795712 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004499799398194584, + "loss": 3.0192, + "theoretical_loss": 4.060125790181171, + "tokens_seen": 359861248 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044996990972918753, + "loss": 3.3024, + "theoretical_loss": 4.060041600445546, + "tokens_seen": 359926784 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044995987963891677, + "loss": 3.331, + "theoretical_loss": 4.059957430329254, + "tokens_seen": 359992320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044994984954864595, + "loss": 3.1361, + "theoretical_loss": 4.0598732798241555, + "tokens_seen": 360057856 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044993981945837513, + "loss": 3.2329, + "theoretical_loss": 4.059789148922111, + "tokens_seen": 360123392 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004499297893681043, + "loss": 3.1152, + "theoretical_loss": 4.059705037614989, + "tokens_seen": 360188928 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044991975927783355, + "loss": 3.0446, + "theoretical_loss": 4.059620945894661, + "tokens_seen": 360254464 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044990972918756267, + "loss": 3.0942, + "theoretical_loss": 4.059536873753004, + "tokens_seen": 360320000 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004498996990972919, + "loss": 3.1938, + "theoretical_loss": 4.059452821181899, + "tokens_seen": 360385536 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 601352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.438528299331665, + "objective/train/theoretical_loss": 4.059389794591865, + "objective/train/tokens_used": 380894688, + "theoretical_loss": 4.059389794591865, + "tokens_seen": 360434688 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044988966900702103, + "loss": 3.217, + "theoretical_loss": 4.059368788173233, + "tokens_seen": 360451072 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044987963891675027, + "loss": 3.3111, + "theoretical_loss": 4.0592847747189, + "tokens_seen": 360516608 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044986960882647945, + "loss": 3.0609, + "theoretical_loss": 4.059200780810793, + "tokens_seen": 360582144 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044985957873620863, + "loss": 3.2434, + "theoretical_loss": 4.059116806440814, + "tokens_seen": 360647680 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004498495486459378, + "loss": 3.1724, + "theoretical_loss": 4.05903285160087, + "tokens_seen": 360713216 + }, + { + "epoch": 1.01, + "learning_rate": 0.000449839518555667, + "loss": 3.1892, + "theoretical_loss": 4.058948916282871, + "tokens_seen": 360778752 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004498294884653962, + "loss": 3.2399, + "theoretical_loss": 4.058865000478733, + "tokens_seen": 360844288 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004498194583751254, + "loss": 3.277, + "theoretical_loss": 4.058781104180377, + "tokens_seen": 360909824 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044980942828485454, + "loss": 3.3747, + "theoretical_loss": 4.058697227379726, + "tokens_seen": 360975360 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044979939819458377, + "loss": 3.2875, + "theoretical_loss": 4.058613370068713, + "tokens_seen": 361040896 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004497893681043129, + "loss": 3.2381, + "theoretical_loss": 4.058529532239271, + "tokens_seen": 361106432 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044977933801404213, + "loss": 3.2313, + "theoretical_loss": 4.05844571388334, + "tokens_seen": 361171968 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004497693079237713, + "loss": 3.064, + "theoretical_loss": 4.058361914992865, + "tokens_seen": 361237504 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004497592778335005, + "loss": 3.0737, + "theoretical_loss": 4.058278135559794, + "tokens_seen": 361303040 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004497492477432297, + "loss": 3.2866, + "theoretical_loss": 4.0581943755760825, + "tokens_seen": 361368576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004497392176529589, + "loss": 3.2402, + "theoretical_loss": 4.058110635033689, + "tokens_seen": 361434112 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044972918756268804, + "loss": 3.2523, + "theoretical_loss": 4.058026913924576, + "tokens_seen": 361499648 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004497191574724173, + "loss": 3.2836, + "theoretical_loss": 4.057943212240713, + "tokens_seen": 361565184 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004497091273821464, + "loss": 2.9739, + "theoretical_loss": 4.057859529974073, + "tokens_seen": 361630720 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044969909729187564, + "loss": 3.0945, + "theoretical_loss": 4.057775867116634, + "tokens_seen": 361696256 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004496890672016049, + "loss": 3.2114, + "theoretical_loss": 4.0576922236603785, + "tokens_seen": 361761792 + }, + { + "epoch": 1.01, + "learning_rate": 0.000449679037111334, + "loss": 3.2615, + "theoretical_loss": 4.057608599597294, + "tokens_seen": 361827328 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044966900702106324, + "loss": 3.1307, + "theoretical_loss": 4.057524994919372, + "tokens_seen": 361892864 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044965897693079236, + "loss": 3.2148, + "theoretical_loss": 4.05744140961861, + "tokens_seen": 361958400 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004496489468405216, + "loss": 3.0576, + "theoretical_loss": 4.0573578436870115, + "tokens_seen": 362023936 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 604190, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.138101577758789, + "objective/train/theoretical_loss": 4.057295181944515, + "objective/train/tokens_used": 382533088, + "theoretical_loss": 4.057295181944515, + "tokens_seen": 362073088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004496389167502508, + "loss": 3.3261, + "theoretical_loss": 4.05727429711658, + "tokens_seen": 362089472 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044962888665997996, + "loss": 3.1794, + "theoretical_loss": 4.057190769899329, + "tokens_seen": 362155008 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044961885656970914, + "loss": 2.9849, + "theoretical_loss": 4.057107262027273, + "tokens_seen": 362220544 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004496088264794383, + "loss": 3.1661, + "theoretical_loss": 4.057023773492434, + "tokens_seen": 362286080 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004495987963891675, + "loss": 2.9568, + "theoretical_loss": 4.056940304286836, + "tokens_seen": 362351616 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044958876629889674, + "loss": 3.0898, + "theoretical_loss": 4.056856854402509, + "tokens_seen": 362417152 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044957873620862587, + "loss": 3.1548, + "theoretical_loss": 4.0567734238314905, + "tokens_seen": 362482688 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004495687061183551, + "loss": 3.2966, + "theoretical_loss": 4.056690012565818, + "tokens_seen": 362548224 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004495586760280843, + "loss": 3.2302, + "theoretical_loss": 4.056606620597536, + "tokens_seen": 362613760 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044954864593781346, + "loss": 3.149, + "theoretical_loss": 4.056523247918694, + "tokens_seen": 362679296 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044953861584754264, + "loss": 3.2413, + "theoretical_loss": 4.056439894521345, + "tokens_seen": 362744832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004495285857572718, + "loss": 3.3302, + "theoretical_loss": 4.056356560397549, + "tokens_seen": 362810368 + }, + { + "epoch": 1.01, + "learning_rate": 0.000449518555667001, + "loss": 3.1144, + "theoretical_loss": 4.0562732455393675, + "tokens_seen": 362875904 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044950852557673024, + "loss": 3.1846, + "theoretical_loss": 4.05618994993887, + "tokens_seen": 362941440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044949849548645937, + "loss": 3.2322, + "theoretical_loss": 4.056106673588127, + "tokens_seen": 363006976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004494884653961886, + "loss": 3.2993, + "theoretical_loss": 4.056023416479217, + "tokens_seen": 363072512 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044947843530591773, + "loss": 3.2952, + "theoretical_loss": 4.055940178604223, + "tokens_seen": 363138048 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044946840521564697, + "loss": 3.0621, + "theoretical_loss": 4.05585695995523, + "tokens_seen": 363203584 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044945837512537615, + "loss": 3.2152, + "theoretical_loss": 4.05577376052433, + "tokens_seen": 363269120 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044944834503510533, + "loss": 3.126, + "theoretical_loss": 4.055690580303619, + "tokens_seen": 363334656 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004494383149448345, + "loss": 2.9744, + "theoretical_loss": 4.055607419285197, + "tokens_seen": 363400192 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044942828485456375, + "loss": 3.2556, + "theoretical_loss": 4.05552427746117, + "tokens_seen": 363465728 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044941825476429287, + "loss": 3.1169, + "theoretical_loss": 4.055441154823648, + "tokens_seen": 363531264 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004494082246740221, + "loss": 3.3264, + "theoretical_loss": 4.055358051364745, + "tokens_seen": 363596800 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044939819458375123, + "loss": 3.3475, + "theoretical_loss": 4.055274967076583, + "tokens_seen": 363662336 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 606755, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.006503105163574, + "objective/train/theoretical_loss": 4.055212666436519, + "objective/train/tokens_used": 384171488, + "theoretical_loss": 4.055212666436519, + "tokens_seen": 363711488 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044938816449348047, + "loss": 3.2275, + "theoretical_loss": 4.055191901951282, + "tokens_seen": 363727872 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044937813440320965, + "loss": 3.1454, + "theoretical_loss": 4.055108855980974, + "tokens_seen": 363793408 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044936810431293883, + "loss": 3.0974, + "theoretical_loss": 4.05502582915779, + "tokens_seen": 363858944 + }, + { + "epoch": 1.01, + "learning_rate": 0.000449358074222668, + "loss": 3.2241, + "theoretical_loss": 4.05494282147387, + "tokens_seen": 363924480 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004493480441323972, + "loss": 3.1579, + "theoretical_loss": 4.0548598329213545, + "tokens_seen": 363990016 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004493380140421264, + "loss": 3.322, + "theoretical_loss": 4.054776863492393, + "tokens_seen": 364055552 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004493279839518556, + "loss": 3.233, + "theoretical_loss": 4.054693913179135, + "tokens_seen": 364121088 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044931795386158474, + "loss": 3.0093, + "theoretical_loss": 4.054610981973738, + "tokens_seen": 364186624 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044930792377131397, + "loss": 3.2013, + "theoretical_loss": 4.054528069868365, + "tokens_seen": 364252160 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004492978936810431, + "loss": 3.4086, + "theoretical_loss": 4.054445176855179, + "tokens_seen": 364317696 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044928786359077234, + "loss": 3.2905, + "theoretical_loss": 4.054362302926351, + "tokens_seen": 364383232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004492778335005015, + "loss": 3.334, + "theoretical_loss": 4.054279448074057, + "tokens_seen": 364448768 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004492678034102307, + "loss": 3.2852, + "theoretical_loss": 4.054196612290476, + "tokens_seen": 364514304 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004492577733199599, + "loss": 3.2355, + "theoretical_loss": 4.054113795567792, + "tokens_seen": 364579840 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004492477432296891, + "loss": 3.1429, + "theoretical_loss": 4.054030997898195, + "tokens_seen": 364645376 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044923771313941824, + "loss": 3.1198, + "theoretical_loss": 4.053948219273877, + "tokens_seen": 364710912 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004492276830491475, + "loss": 3.2963, + "theoretical_loss": 4.053865459687037, + "tokens_seen": 364776448 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004492176529588766, + "loss": 3.4295, + "theoretical_loss": 4.053782719129877, + "tokens_seen": 364841984 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044920762286860584, + "loss": 3.0987, + "theoretical_loss": 4.053699997594605, + "tokens_seen": 364907520 + }, + { + "epoch": 1.01, + "learning_rate": 0.000449197592778335, + "loss": 3.279, + "theoretical_loss": 4.053617295073432, + "tokens_seen": 364973056 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004491875626880642, + "loss": 3.3488, + "theoretical_loss": 4.053534611558575, + "tokens_seen": 365038592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004491775325977934, + "loss": 3.2159, + "theoretical_loss": 4.053451947042255, + "tokens_seen": 365104128 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044916750250752256, + "loss": 2.9275, + "theoretical_loss": 4.053369301516697, + "tokens_seen": 365169664 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044915747241725174, + "loss": 3.2273, + "theoretical_loss": 4.053286674974132, + "tokens_seen": 365235200 + }, + { + "epoch": 1.01, + "learning_rate": 0.000449147442326981, + "loss": 3.0753, + "theoretical_loss": 4.053204067406793, + "tokens_seen": 365300736 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 608178, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1334376335144043, + "objective/train/theoretical_loss": 4.053142124179114, + "objective/train/tokens_used": 385809888, + "theoretical_loss": 4.053142124179114, + "tokens_seen": 365349888 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004491374122367101, + "loss": 3.2298, + "theoretical_loss": 4.053121478806922, + "tokens_seen": 365366272 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044912738214643934, + "loss": 3.1976, + "theoretical_loss": 4.0530389091667605, + "tokens_seen": 365431808 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044911735205616847, + "loss": 3.0537, + "theoretical_loss": 4.052956358478558, + "tokens_seen": 365497344 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004491073219658977, + "loss": 3.3107, + "theoretical_loss": 4.052873826734567, + "tokens_seen": 365562880 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004490972918756269, + "loss": 3.2454, + "theoretical_loss": 4.052791313927045, + "tokens_seen": 365628416 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044908726178535607, + "loss": 3.1872, + "theoretical_loss": 4.052708820048256, + "tokens_seen": 365693952 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044907723169508525, + "loss": 3.3036, + "theoretical_loss": 4.052626345090464, + "tokens_seen": 365759488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004490672016048145, + "loss": 3.1293, + "theoretical_loss": 4.052543889045941, + "tokens_seen": 365825024 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004490571715145436, + "loss": 3.1902, + "theoretical_loss": 4.052461451906963, + "tokens_seen": 365890560 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044904714142427284, + "loss": 3.0576, + "theoretical_loss": 4.05237903366581, + "tokens_seen": 365956096 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044903711133400197, + "loss": 3.2025, + "theoretical_loss": 4.052296634314767, + "tokens_seen": 366021632 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004490270812437312, + "loss": 3.0898, + "theoretical_loss": 4.052214253846124, + "tokens_seen": 366087168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004490170511534604, + "loss": 3.3255, + "theoretical_loss": 4.052131892252174, + "tokens_seen": 366152704 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044900702106318957, + "loss": 3.1188, + "theoretical_loss": 4.052049549525214, + "tokens_seen": 366218240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044899699097291875, + "loss": 3.2819, + "theoretical_loss": 4.05196722565755, + "tokens_seen": 366283776 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044898696088264793, + "loss": 3.2147, + "theoretical_loss": 4.051884920641487, + "tokens_seen": 366349312 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004489769307923771, + "loss": 3.2837, + "theoretical_loss": 4.051802634469338, + "tokens_seen": 366414848 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044896690070210635, + "loss": 3.1821, + "theoretical_loss": 4.051720367133419, + "tokens_seen": 366480384 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004489568706118355, + "loss": 3.2999, + "theoretical_loss": 4.051638118626052, + "tokens_seen": 366545920 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004489468405215647, + "loss": 3.0909, + "theoretical_loss": 4.05155588893956, + "tokens_seen": 366611456 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044893681043129395, + "loss": 3.0975, + "theoretical_loss": 4.051473678066275, + "tokens_seen": 366676992 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044892678034102307, + "loss": 3.0522, + "theoretical_loss": 4.051391485998531, + "tokens_seen": 366742528 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004489167502507523, + "loss": 3.18, + "theoretical_loss": 4.051309312728667, + "tokens_seen": 366808064 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044890672016048143, + "loss": 3.21, + "theoretical_loss": 4.051227158249025, + "tokens_seen": 366873600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044889669007021067, + "loss": 3.3125, + "theoretical_loss": 4.051145022551956, + "tokens_seen": 366939136 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 610875, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.328371047973633, + "objective/train/theoretical_loss": 4.051083433100615, + "objective/train/tokens_used": 387448288, + "theoretical_loss": 4.051083433100615, + "tokens_seen": 366988288 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044888665997993985, + "loss": 3.1364, + "theoretical_loss": 4.051062905629809, + "tokens_seen": 367004672 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044887662988966903, + "loss": 3.0864, + "theoretical_loss": 4.050980807474944, + "tokens_seen": 367070208 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004488665997993982, + "loss": 3.332, + "theoretical_loss": 4.050898728079719, + "tokens_seen": 367135744 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004488565697091274, + "loss": 3.1484, + "theoretical_loss": 4.050816667436502, + "tokens_seen": 367201280 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004488465396188566, + "loss": 3.2478, + "theoretical_loss": 4.050734625537663, + "tokens_seen": 367266816 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004488365095285858, + "loss": 3.2475, + "theoretical_loss": 4.0506526023755764, + "tokens_seen": 367332352 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044882647943831494, + "loss": 3.2698, + "theoretical_loss": 4.050570597942622, + "tokens_seen": 367397888 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004488164493480442, + "loss": 3.3009, + "theoretical_loss": 4.050488612231183, + "tokens_seen": 367463424 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004488064192577733, + "loss": 3.1686, + "theoretical_loss": 4.050406645233647, + "tokens_seen": 367528960 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044879638916750254, + "loss": 3.0826, + "theoretical_loss": 4.050324696942407, + "tokens_seen": 367594496 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004487863590772317, + "loss": 3.3018, + "theoretical_loss": 4.05024276734986, + "tokens_seen": 367660032 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004487763289869609, + "loss": 3.3329, + "theoretical_loss": 4.050160856448408, + "tokens_seen": 367725568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004487662988966901, + "loss": 3.1462, + "theoretical_loss": 4.050078964230456, + "tokens_seen": 367791104 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004487562688064193, + "loss": 3.2288, + "theoretical_loss": 4.049997090688415, + "tokens_seen": 367856640 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044874623871614844, + "loss": 3.1699, + "theoretical_loss": 4.049915235814701, + "tokens_seen": 367922176 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004487362086258777, + "loss": 3.2048, + "theoretical_loss": 4.04983339960173, + "tokens_seen": 367987712 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004487261785356068, + "loss": 3.2109, + "theoretical_loss": 4.049751582041928, + "tokens_seen": 368053248 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044871614844533604, + "loss": 3.4461, + "theoretical_loss": 4.049669783127722, + "tokens_seen": 368118784 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004487061183550652, + "loss": 3.0677, + "theoretical_loss": 4.049588002851546, + "tokens_seen": 368184320 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004486960882647944, + "loss": 3.2429, + "theoretical_loss": 4.049506241205835, + "tokens_seen": 368249856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004486860581745236, + "loss": 3.069, + "theoretical_loss": 4.049424498183031, + "tokens_seen": 368315392 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044867602808425276, + "loss": 3.2233, + "theoretical_loss": 4.04934277377558, + "tokens_seen": 368380928 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044866599799398194, + "loss": 3.1754, + "theoretical_loss": 4.049261067975932, + "tokens_seen": 368446464 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004486559679037112, + "loss": 3.2397, + "theoretical_loss": 4.049179380776542, + "tokens_seen": 368512000 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004486459378134403, + "loss": 3.1143, + "theoretical_loss": 4.049097712169869, + "tokens_seen": 368577536 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 613523, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.219022750854492, + "objective/train/theoretical_loss": 4.0490364729118, + "objective/train/tokens_used": 389086688, + "theoretical_loss": 4.0490364729118, + "tokens_seen": 368626688 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044863590772316954, + "loss": 3.0595, + "theoretical_loss": 4.049016062148374, + "tokens_seen": 368643072 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044862587763289867, + "loss": 3.2046, + "theoretical_loss": 4.048934430704529, + "tokens_seen": 368708608 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004486158475426279, + "loss": 3.2381, + "theoretical_loss": 4.048852817830801, + "tokens_seen": 368774144 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004486058174523571, + "loss": 3.3309, + "theoretical_loss": 4.048771223519671, + "tokens_seen": 368839680 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044859578736208627, + "loss": 3.0516, + "theoretical_loss": 4.048689647763618, + "tokens_seen": 368905216 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044858575727181545, + "loss": 2.9922, + "theoretical_loss": 4.048608090555127, + "tokens_seen": 368970752 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004485757271815447, + "loss": 3.186, + "theoretical_loss": 4.048526551886687, + "tokens_seen": 369036288 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004485656970912738, + "loss": 3.0352, + "theoretical_loss": 4.048445031750795, + "tokens_seen": 369101824 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044855566700100304, + "loss": 3.1318, + "theoretical_loss": 4.048363530139945, + "tokens_seen": 369167360 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044854563691073217, + "loss": 2.976, + "theoretical_loss": 4.048282047046644, + "tokens_seen": 369232896 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004485356068204614, + "loss": 2.9555, + "theoretical_loss": 4.048200582463396, + "tokens_seen": 369298432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004485255767301906, + "loss": 3.2057, + "theoretical_loss": 4.048119136382715, + "tokens_seen": 369363968 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044851554663991977, + "loss": 3.063, + "theoretical_loss": 4.048037708797115, + "tokens_seen": 369429504 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044850551654964895, + "loss": 3.0624, + "theoretical_loss": 4.047956299699117, + "tokens_seen": 369495040 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044849548645937813, + "loss": 3.2876, + "theoretical_loss": 4.047874909081245, + "tokens_seen": 369560576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004484854563691073, + "loss": 3.1584, + "theoretical_loss": 4.0477935369360285, + "tokens_seen": 369626112 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044847542627883655, + "loss": 3.3316, + "theoretical_loss": 4.047712183256, + "tokens_seen": 369691648 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004484653961885657, + "loss": 3.1998, + "theoretical_loss": 4.047630848033698, + "tokens_seen": 369757184 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004484553660982949, + "loss": 3.2247, + "theoretical_loss": 4.047549531261664, + "tokens_seen": 369822720 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044844533600802404, + "loss": 3.0955, + "theoretical_loss": 4.047468232932444, + "tokens_seen": 369888256 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044843530591775327, + "loss": 3.2701, + "theoretical_loss": 4.04738695303859, + "tokens_seen": 369953792 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044842527582748245, + "loss": 3.1516, + "theoretical_loss": 4.047305691572654, + "tokens_seen": 370019328 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044841524573721163, + "loss": 3.1769, + "theoretical_loss": 4.0472244485271975, + "tokens_seen": 370084864 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004484052156469408, + "loss": 3.145, + "theoretical_loss": 4.047143223894784, + "tokens_seen": 370150400 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044839518555667005, + "loss": 3.0602, + "theoretical_loss": 4.047062017667981, + "tokens_seen": 370215936 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 616213, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.181370735168457, + "objective/train/theoretical_loss": 4.047001125072091, + "objective/train/tokens_used": 390725088, + "theoretical_loss": 4.047001125072091, + "tokens_seen": 370265088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004483851554663992, + "loss": 3.0964, + "theoretical_loss": 4.04698082983936, + "tokens_seen": 370281472 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004483751253761284, + "loss": 3.1254, + "theoretical_loss": 4.046899660401499, + "tokens_seen": 370347008 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044836509528585754, + "loss": 3.1774, + "theoretical_loss": 4.046818509346977, + "tokens_seen": 370412544 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004483550651955868, + "loss": 3.0603, + "theoretical_loss": 4.04673737666838, + "tokens_seen": 370478080 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044834503510531596, + "loss": 3.2412, + "theoretical_loss": 4.046656262358297, + "tokens_seen": 370543616 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044833500501504514, + "loss": 3.2405, + "theoretical_loss": 4.0465751664093235, + "tokens_seen": 370609152 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004483249749247743, + "loss": 3.0053, + "theoretical_loss": 4.046494088814056, + "tokens_seen": 370674688 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004483149448345035, + "loss": 3.1446, + "theoretical_loss": 4.046413029565096, + "tokens_seen": 370740224 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004483049147442327, + "loss": 3.0005, + "theoretical_loss": 4.0463319886550515, + "tokens_seen": 370805760 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004482948846539619, + "loss": 3.1358, + "theoretical_loss": 4.046250966076533, + "tokens_seen": 370871296 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044828485456369104, + "loss": 3.0399, + "theoretical_loss": 4.046169961822156, + "tokens_seen": 370936832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004482748244734203, + "loss": 3.1566, + "theoretical_loss": 4.0460889758845395, + "tokens_seen": 371002368 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004482647943831494, + "loss": 3.118, + "theoretical_loss": 4.046008008256307, + "tokens_seen": 371067904 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044825476429287864, + "loss": 3.2326, + "theoretical_loss": 4.045927058930086, + "tokens_seen": 371133440 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004482447342026078, + "loss": 3.1339, + "theoretical_loss": 4.045846127898511, + "tokens_seen": 371198976 + }, + { + "epoch": 1.01, + "learning_rate": 0.000448234704112337, + "loss": 3.1678, + "theoretical_loss": 4.0457652151542165, + "tokens_seen": 371264512 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004482246740220662, + "loss": 3.2508, + "theoretical_loss": 4.045684320689844, + "tokens_seen": 371330048 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004482146439317954, + "loss": 3.354, + "theoretical_loss": 4.045603444498037, + "tokens_seen": 371395584 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044820461384152455, + "loss": 3.0726, + "theoretical_loss": 4.0455225865714475, + "tokens_seen": 371461120 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004481945837512538, + "loss": 3.1423, + "theoretical_loss": 4.0454417469027275, + "tokens_seen": 371526656 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044818455366098296, + "loss": 2.9992, + "theoretical_loss": 4.045360925484535, + "tokens_seen": 371592192 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044817452357071214, + "loss": 3.1144, + "theoretical_loss": 4.045280122309532, + "tokens_seen": 371657728 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004481644934804414, + "loss": 3.1227, + "theoretical_loss": 4.045199337370385, + "tokens_seen": 371723264 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004481544633901705, + "loss": 3.1041, + "theoretical_loss": 4.045118570659764, + "tokens_seen": 371788800 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044814443329989974, + "loss": 3.3218, + "theoretical_loss": 4.045037822170345, + "tokens_seen": 371854336 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 619201, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.73344349861145, + "objective/train/theoretical_loss": 4.044977272756539, + "objective/train/tokens_used": 392363488, + "theoretical_loss": 4.044977272756539, + "tokens_seen": 371903488 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044813440320962887, + "loss": 3.131, + "theoretical_loss": 4.044957091894806, + "tokens_seen": 371919872 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004481243731193581, + "loss": 3.2507, + "theoretical_loss": 4.04487637982583, + "tokens_seen": 371985408 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004481143430290873, + "loss": 3.1635, + "theoretical_loss": 4.044795685956105, + "tokens_seen": 372050944 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044810431293881647, + "loss": 3.3104, + "theoretical_loss": 4.044715010278322, + "tokens_seen": 372116480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044809428284854565, + "loss": 3.303, + "theoretical_loss": 4.044634352785179, + "tokens_seen": 372182016 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004480842527582749, + "loss": 3.2744, + "theoretical_loss": 4.044553713469373, + "tokens_seen": 372247552 + }, + { + "epoch": 1.01, + "learning_rate": 0.000448074222668004, + "loss": 3.0619, + "theoretical_loss": 4.044473092323611, + "tokens_seen": 372313088 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044806419257773324, + "loss": 3.2262, + "theoretical_loss": 4.0443924893406, + "tokens_seen": 372378624 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044805416248746237, + "loss": 3.2775, + "theoretical_loss": 4.044311904513054, + "tokens_seen": 372444160 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004480441323971916, + "loss": 3.1768, + "theoretical_loss": 4.044231337833689, + "tokens_seen": 372509696 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004480341023069208, + "loss": 3.2928, + "theoretical_loss": 4.044150789295227, + "tokens_seen": 372575232 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044802407221664997, + "loss": 3.196, + "theoretical_loss": 4.044070258890391, + "tokens_seen": 372640768 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044801404212637915, + "loss": 3.0005, + "theoretical_loss": 4.0439897466119135, + "tokens_seen": 372706304 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044800401203610833, + "loss": 3.0641, + "theoretical_loss": 4.0439092524525275, + "tokens_seen": 372771840 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004479939819458375, + "loss": 3.11, + "theoretical_loss": 4.04382877640497, + "tokens_seen": 372837376 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044798395185556675, + "loss": 3.0772, + "theoretical_loss": 4.043748318461985, + "tokens_seen": 372902912 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004479739217652959, + "loss": 3.2673, + "theoretical_loss": 4.043667878616316, + "tokens_seen": 372968448 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004479638916750251, + "loss": 3.1364, + "theoretical_loss": 4.043587456860715, + "tokens_seen": 373033984 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044795386158475424, + "loss": 3.2477, + "theoretical_loss": 4.043507053187938, + "tokens_seen": 373099520 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044794383149448347, + "loss": 3.3117, + "theoretical_loss": 4.043426667590741, + "tokens_seen": 373165056 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044793380140421265, + "loss": 3.3089, + "theoretical_loss": 4.04334630006189, + "tokens_seen": 373230592 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044792377131394183, + "loss": 2.9454, + "theoretical_loss": 4.04326595059415, + "tokens_seen": 373296128 + }, + { + "epoch": 1.01, + "learning_rate": 0.000447913741223671, + "loss": 3.2037, + "theoretical_loss": 4.043185619180294, + "tokens_seen": 373361664 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044790371113340025, + "loss": 3.186, + "theoretical_loss": 4.0431053058130955, + "tokens_seen": 373427200 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004478936810431294, + "loss": 3.3693, + "theoretical_loss": 4.043025010485336, + "tokens_seen": 373492736 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 620463, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2226457595825195, + "objective/train/theoretical_loss": 4.042964800823556, + "objective/train/tokens_used": 394001888, + "theoretical_loss": 4.042964800823556, + "tokens_seen": 373541888 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004478836509528586, + "loss": 3.2989, + "theoretical_loss": 4.042944733189799, + "tokens_seen": 373558272 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044787362086258774, + "loss": 3.3737, + "theoretical_loss": 4.042864473919272, + "tokens_seen": 373623808 + }, + { + "epoch": 1.01, + "learning_rate": 0.000447863590772317, + "loss": 3.0648, + "theoretical_loss": 4.042784232666547, + "tokens_seen": 373689344 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044785356068204616, + "loss": 3.3706, + "theoretical_loss": 4.04270400942442, + "tokens_seen": 373754880 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044784353059177534, + "loss": 3.1387, + "theoretical_loss": 4.042623804185692, + "tokens_seen": 373820416 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004478335005015045, + "loss": 3.2366, + "theoretical_loss": 4.042543616943168, + "tokens_seen": 373885952 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004478234704112337, + "loss": 3.2295, + "theoretical_loss": 4.042463447689657, + "tokens_seen": 373951488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004478134403209629, + "loss": 3.139, + "theoretical_loss": 4.042383296417969, + "tokens_seen": 374017024 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004478034102306921, + "loss": 3.2526, + "theoretical_loss": 4.042303163120925, + "tokens_seen": 374082560 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044779338014042124, + "loss": 2.9804, + "theoretical_loss": 4.042223047791343, + "tokens_seen": 374148096 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004477833500501505, + "loss": 2.974, + "theoretical_loss": 4.04214295042205, + "tokens_seen": 374213632 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004477733199598796, + "loss": 3.0646, + "theoretical_loss": 4.042062871005874, + "tokens_seen": 374279168 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044776328986960884, + "loss": 3.1427, + "theoretical_loss": 4.041982809535649, + "tokens_seen": 374344704 + }, + { + "epoch": 1.01, + "learning_rate": 0.000447753259779338, + "loss": 3.2936, + "theoretical_loss": 4.041902766004213, + "tokens_seen": 374410240 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004477432296890672, + "loss": 3.2687, + "theoretical_loss": 4.041822740404407, + "tokens_seen": 374475776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004477331995987964, + "loss": 3.1485, + "theoretical_loss": 4.041742732729078, + "tokens_seen": 374541312 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004477231695085256, + "loss": 3.2163, + "theoretical_loss": 4.041662742971074, + "tokens_seen": 374606848 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044771313941825475, + "loss": 3.1984, + "theoretical_loss": 4.04158277112325, + "tokens_seen": 374672384 + }, + { + "epoch": 1.01, + "learning_rate": 0.000447703109327984, + "loss": 3.3133, + "theoretical_loss": 4.041502817178464, + "tokens_seen": 374737920 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004476930792377131, + "loss": 3.2447, + "theoretical_loss": 4.041422881129579, + "tokens_seen": 374803456 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044768304914744234, + "loss": 3.0292, + "theoretical_loss": 4.041342962969459, + "tokens_seen": 374868992 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004476730190571715, + "loss": 3.2776, + "theoretical_loss": 4.041263062690978, + "tokens_seen": 374934528 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004476629889669007, + "loss": 3.2159, + "theoretical_loss": 4.041183180287007, + "tokens_seen": 375000064 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004476529588766299, + "loss": 3.1455, + "theoretical_loss": 4.0411033157504255, + "tokens_seen": 375065600 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044764292878635907, + "loss": 3.3078, + "theoretical_loss": 4.041023469074117, + "tokens_seen": 375131136 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 623485, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.134404182434082, + "objective/train/theoretical_loss": 4.04096359578341, + "objective/train/tokens_used": 395640288, + "theoretical_loss": 4.04096359578341, + "tokens_seen": 375180288 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044763289869608825, + "loss": 3.1696, + "theoretical_loss": 4.040943640250967, + "tokens_seen": 375196672 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004476228686058175, + "loss": 3.2487, + "theoretical_loss": 4.040863829273868, + "tokens_seen": 375262208 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004476128385155466, + "loss": 3.2153, + "theoretical_loss": 4.0407840361357135, + "tokens_seen": 375327744 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044760280842527585, + "loss": 3.1896, + "theoretical_loss": 4.040704260829403, + "tokens_seen": 375393280 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044759277833500503, + "loss": 3.1262, + "theoretical_loss": 4.040624503347839, + "tokens_seen": 375458816 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004475827482447342, + "loss": 3.0499, + "theoretical_loss": 4.040544763683929, + "tokens_seen": 375524352 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004475727181544634, + "loss": 3.1321, + "theoretical_loss": 4.040465041830583, + "tokens_seen": 375589888 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044756268806419257, + "loss": 3.0981, + "theoretical_loss": 4.040385337780718, + "tokens_seen": 375655424 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044755265797392175, + "loss": 3.2256, + "theoretical_loss": 4.040305651527252, + "tokens_seen": 375720960 + }, + { + "epoch": 1.01, + "learning_rate": 0.000447542627883651, + "loss": 3.1743, + "theoretical_loss": 4.040225983063108, + "tokens_seen": 375786496 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004475325977933801, + "loss": 3.1277, + "theoretical_loss": 4.040146332381214, + "tokens_seen": 375852032 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044752256770310935, + "loss": 3.3352, + "theoretical_loss": 4.040066699474501, + "tokens_seen": 375917568 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004475125376128385, + "loss": 3.1664, + "theoretical_loss": 4.0399870843359045, + "tokens_seen": 375983104 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004475025075225677, + "loss": 3.1807, + "theoretical_loss": 4.039907486958365, + "tokens_seen": 376048640 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004474924774322969, + "loss": 3.3548, + "theoretical_loss": 4.039827907334824, + "tokens_seen": 376114176 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004474824473420261, + "loss": 3.2523, + "theoretical_loss": 4.0397483454582295, + "tokens_seen": 376179712 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044747241725175526, + "loss": 3.1865, + "theoretical_loss": 4.039668801321534, + "tokens_seen": 376245248 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044746238716148444, + "loss": 3.1466, + "theoretical_loss": 4.039589274917693, + "tokens_seen": 376310784 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004474523570712136, + "loss": 3.0607, + "theoretical_loss": 4.039509766239665, + "tokens_seen": 376376320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044744232698094285, + "loss": 3.1268, + "theoretical_loss": 4.039430275280415, + "tokens_seen": 376441856 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044743229689067204, + "loss": 3.1918, + "theoretical_loss": 4.03935080203291, + "tokens_seen": 376507392 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004474222668004012, + "loss": 3.2706, + "theoretical_loss": 4.03927134649012, + "tokens_seen": 376572928 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044741223671013045, + "loss": 3.3206, + "theoretical_loss": 4.039191908645024, + "tokens_seen": 376638464 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004474022066198596, + "loss": 3.1134, + "theoretical_loss": 4.0391124884905985, + "tokens_seen": 376704000 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004473921765295888, + "loss": 3.185, + "theoretical_loss": 4.039033086019829, + "tokens_seen": 376769536 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 626181, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1734139919281006, + "objective/train/theoretical_loss": 4.0389735457674325, + "objective/train/tokens_used": 397278688, + "theoretical_loss": 4.0389735457674325, + "tokens_seen": 376818688 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044738214643931794, + "loss": 3.1223, + "theoretical_loss": 4.038953701225703, + "tokens_seen": 376835072 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004473721163490472, + "loss": 3.133, + "theoretical_loss": 4.0388743341012106, + "tokens_seen": 376900608 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044736208625877636, + "loss": 3.3066, + "theoretical_loss": 4.03879498463935, + "tokens_seen": 376966144 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044735205616850554, + "loss": 3.1376, + "theoretical_loss": 4.038715652833118, + "tokens_seen": 377031680 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004473420260782347, + "loss": 3.2057, + "theoretical_loss": 4.038636338675521, + "tokens_seen": 377097216 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004473319959879639, + "loss": 3.1925, + "theoretical_loss": 4.038557042159566, + "tokens_seen": 377162752 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004473219658976931, + "loss": 3.2862, + "theoretical_loss": 4.038477763278262, + "tokens_seen": 377228288 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004473119358074223, + "loss": 3.1208, + "theoretical_loss": 4.038398502024628, + "tokens_seen": 377293824 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044730190571715144, + "loss": 3.0717, + "theoretical_loss": 4.038319258391682, + "tokens_seen": 377359360 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004472918756268807, + "loss": 3.1778, + "theoretical_loss": 4.038240032372447, + "tokens_seen": 377424896 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004472818455366098, + "loss": 3.1321, + "theoretical_loss": 4.038160823959952, + "tokens_seen": 377490432 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044727181544633904, + "loss": 3.2548, + "theoretical_loss": 4.038081633147227, + "tokens_seen": 377555968 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004472617853560682, + "loss": 3.0263, + "theoretical_loss": 4.038002459927309, + "tokens_seen": 377621504 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004472517552657974, + "loss": 3.2498, + "theoretical_loss": 4.037923304293237, + "tokens_seen": 377687040 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004472417251755266, + "loss": 3.1267, + "theoretical_loss": 4.037844166238053, + "tokens_seen": 377752576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004472316950852558, + "loss": 3.3872, + "theoretical_loss": 4.037765045754806, + "tokens_seen": 377818112 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044722166499498495, + "loss": 3.0492, + "theoretical_loss": 4.037685942836546, + "tokens_seen": 377883648 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004472116349047142, + "loss": 3.1318, + "theoretical_loss": 4.03760685747633, + "tokens_seen": 377949184 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004472016048144433, + "loss": 3.1904, + "theoretical_loss": 4.037527789667216, + "tokens_seen": 378014720 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044719157472417254, + "loss": 3.1847, + "theoretical_loss": 4.037448739402267, + "tokens_seen": 378080256 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004471815446339017, + "loss": 3.0733, + "theoretical_loss": 4.03736970667455, + "tokens_seen": 378145792 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004471715145436309, + "loss": 3.0748, + "theoretical_loss": 4.0372906914771365, + "tokens_seen": 378211328 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004471614844533601, + "loss": 3.1387, + "theoretical_loss": 4.037211693803101, + "tokens_seen": 378276864 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044715145436308927, + "loss": 3.3734, + "theoretical_loss": 4.037132713645525, + "tokens_seen": 378342400 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044714142427281845, + "loss": 3.2528, + "theoretical_loss": 4.037053750997487, + "tokens_seen": 378407936 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 629062, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.992845058441162, + "objective/train/theoretical_loss": 4.036994540497936, + "objective/train/tokens_used": 398917088, + "theoretical_loss": 4.036994540497936, + "tokens_seen": 378457088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004471313941825477, + "loss": 3.1005, + "theoretical_loss": 4.0369748058520765, + "tokens_seen": 378473472 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004471213640922768, + "loss": 3.1614, + "theoretical_loss": 4.036895878202383, + "tokens_seen": 378539008 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044711133400200605, + "loss": 3.0822, + "theoretical_loss": 4.036816968041503, + "tokens_seen": 378604544 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044710130391173523, + "loss": 3.2644, + "theoretical_loss": 4.036738075362533, + "tokens_seen": 378670080 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004470912738214644, + "loss": 3.3427, + "theoretical_loss": 4.036659200158576, + "tokens_seen": 378735616 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004470812437311936, + "loss": 3.0353, + "theoretical_loss": 4.036580342422739, + "tokens_seen": 378801152 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044707121364092277, + "loss": 2.9781, + "theoretical_loss": 4.036501502148132, + "tokens_seen": 378866688 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044706118355065195, + "loss": 2.9636, + "theoretical_loss": 4.036422679327869, + "tokens_seen": 378932224 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004470511534603812, + "loss": 3.2934, + "theoretical_loss": 4.036343873955068, + "tokens_seen": 378997760 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004470411233701103, + "loss": 3.1731, + "theoretical_loss": 4.036265086022851, + "tokens_seen": 379063296 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044703109327983955, + "loss": 3.2596, + "theoretical_loss": 4.036186315524344, + "tokens_seen": 379128832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004470210631895687, + "loss": 3.151, + "theoretical_loss": 4.036107562452677, + "tokens_seen": 379194368 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004470110330992979, + "loss": 3.0272, + "theoretical_loss": 4.036028826800983, + "tokens_seen": 379259904 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004470010030090271, + "loss": 3.2735, + "theoretical_loss": 4.035950108562401, + "tokens_seen": 379325440 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004469909729187563, + "loss": 3.1764, + "theoretical_loss": 4.035871407730071, + "tokens_seen": 379390976 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044698094282848546, + "loss": 3.2364, + "theoretical_loss": 4.035792724297139, + "tokens_seen": 379456512 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044697091273821464, + "loss": 3.0163, + "theoretical_loss": 4.0357140582567546, + "tokens_seen": 379522048 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004469608826479438, + "loss": 3.2411, + "theoretical_loss": 4.03563540960207, + "tokens_seen": 379587584 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044695085255767305, + "loss": 3.1991, + "theoretical_loss": 4.035556778326242, + "tokens_seen": 379653120 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004469408224674022, + "loss": 3.078, + "theoretical_loss": 4.035478164422434, + "tokens_seen": 379718656 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004469307923771314, + "loss": 3.1815, + "theoretical_loss": 4.0353995678838075, + "tokens_seen": 379784192 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004469207622868606, + "loss": 3.1284, + "theoretical_loss": 4.035320988703533, + "tokens_seen": 379849728 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004469107321965898, + "loss": 3.3438, + "theoretical_loss": 4.035242426874782, + "tokens_seen": 379915264 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044690070210631896, + "loss": 3.0859, + "theoretical_loss": 4.035163882390732, + "tokens_seen": 379980800 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044689067201604814, + "loss": 3.117, + "theoretical_loss": 4.035085355244561, + "tokens_seen": 380046336 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 631903, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0521645545959473, + "objective/train/theoretical_loss": 4.035026471258818, + "objective/train/tokens_used": 400555488, + "theoretical_loss": 4.035026471258818, + "tokens_seen": 380095488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004468806419257773, + "loss": 3.0631, + "theoretical_loss": 4.035006845429456, + "tokens_seen": 380111872 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044687061183550656, + "loss": 3.1317, + "theoretical_loss": 4.0349283529386035, + "tokens_seen": 380177408 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004468605817452357, + "loss": 3.224, + "theoretical_loss": 4.034849877765194, + "tokens_seen": 380242944 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004468505516549649, + "loss": 3.0421, + "theoretical_loss": 4.034771419902425, + "tokens_seen": 380308480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044684052156469405, + "loss": 3.1519, + "theoretical_loss": 4.034692979343495, + "tokens_seen": 380374016 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004468304914744233, + "loss": 3.1461, + "theoretical_loss": 4.034614556081609, + "tokens_seen": 380439552 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044682046138415246, + "loss": 3.0569, + "theoretical_loss": 4.034536150109971, + "tokens_seen": 380505088 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044681043129388164, + "loss": 3.1469, + "theoretical_loss": 4.034457761421794, + "tokens_seen": 380570624 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004468004012036108, + "loss": 3.2402, + "theoretical_loss": 4.034379390010292, + "tokens_seen": 380636160 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044679037111334, + "loss": 3.2451, + "theoretical_loss": 4.034301035868685, + "tokens_seen": 380701696 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004467803410230692, + "loss": 3.0975, + "theoretical_loss": 4.034222698990194, + "tokens_seen": 380767232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004467703109327984, + "loss": 3.1479, + "theoretical_loss": 4.034144379368046, + "tokens_seen": 380832768 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044676028084252755, + "loss": 3.2175, + "theoretical_loss": 4.0340660769954715, + "tokens_seen": 380898304 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004467502507522568, + "loss": 3.1303, + "theoretical_loss": 4.033987791865703, + "tokens_seen": 380963840 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044674022066198597, + "loss": 3.1153, + "theoretical_loss": 4.03390952397198, + "tokens_seen": 381029376 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044673019057171515, + "loss": 3.321, + "theoretical_loss": 4.033831273307542, + "tokens_seen": 381094912 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044672016048144433, + "loss": 3.3297, + "theoretical_loss": 4.033753039865637, + "tokens_seen": 381160448 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004467101303911735, + "loss": 3.2717, + "theoretical_loss": 4.033674823639512, + "tokens_seen": 381225984 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004467001003009027, + "loss": 3.2902, + "theoretical_loss": 4.033596624622421, + "tokens_seen": 381291520 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004466900702106319, + "loss": 3.2373, + "theoretical_loss": 4.03351844280762, + "tokens_seen": 381357056 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004466800401203611, + "loss": 3.2396, + "theoretical_loss": 4.033440278188371, + "tokens_seen": 381422592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004466700100300903, + "loss": 3.1023, + "theoretical_loss": 4.033362130757936, + "tokens_seen": 381488128 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044665997993981947, + "loss": 3.3712, + "theoretical_loss": 4.033284000509586, + "tokens_seen": 381553664 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044664994984954865, + "loss": 3.1716, + "theoretical_loss": 4.033205887436592, + "tokens_seen": 381619200 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004466399197592779, + "loss": 3.2322, + "theoretical_loss": 4.033127791532229, + "tokens_seen": 381684736 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 633212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.998805284500122, + "objective/train/theoretical_loss": 4.033069230866828, + "objective/train/tokens_used": 402193888, + "theoretical_loss": 4.033069230866828, + "tokens_seen": 381733888 + }, + { + "epoch": 1.01, + "learning_rate": 0.000446629889669007, + "loss": 3.2531, + "theoretical_loss": 4.0330497127897775, + "tokens_seen": 381750272 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044661985957873625, + "loss": 2.9961, + "theoretical_loss": 4.032971651202519, + "tokens_seen": 381815808 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044660982948846543, + "loss": 3.0884, + "theoretical_loss": 4.032893606763744, + "tokens_seen": 381881344 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004465997993981946, + "loss": 3.2085, + "theoretical_loss": 4.03281557946674, + "tokens_seen": 381946880 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004465897693079238, + "loss": 3.1997, + "theoretical_loss": 4.032737569304803, + "tokens_seen": 382012416 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044657973921765297, + "loss": 3.0105, + "theoretical_loss": 4.032659576271232, + "tokens_seen": 382077952 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044656970912738215, + "loss": 3.1797, + "theoretical_loss": 4.032581600359329, + "tokens_seen": 382143488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004465596790371114, + "loss": 3.2436, + "theoretical_loss": 4.0325036415624, + "tokens_seen": 382209024 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004465496489468405, + "loss": 3.2219, + "theoretical_loss": 4.0324256998737535, + "tokens_seen": 382274560 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044653961885656975, + "loss": 3.2311, + "theoretical_loss": 4.032347775286704, + "tokens_seen": 382340096 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004465295887662989, + "loss": 3.3359, + "theoretical_loss": 4.03226986779457, + "tokens_seen": 382405632 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004465195586760281, + "loss": 3.2003, + "theoretical_loss": 4.03219197739067, + "tokens_seen": 382471168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004465095285857573, + "loss": 3.287, + "theoretical_loss": 4.032114104068331, + "tokens_seen": 382536704 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004464994984954865, + "loss": 3.1892, + "theoretical_loss": 4.032036247820879, + "tokens_seen": 382602240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044648946840521566, + "loss": 3.2536, + "theoretical_loss": 4.03195840864165, + "tokens_seen": 382667776 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044647943831494484, + "loss": 3.1457, + "theoretical_loss": 4.031880586523976, + "tokens_seen": 382733312 + }, + { + "epoch": 1.01, + "learning_rate": 0.000446469408224674, + "loss": 3.1563, + "theoretical_loss": 4.0318027814612, + "tokens_seen": 382798848 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044645937813440325, + "loss": 3.2129, + "theoretical_loss": 4.031724993446663, + "tokens_seen": 382864384 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004464493480441324, + "loss": 3.1512, + "theoretical_loss": 4.031647222473714, + "tokens_seen": 382929920 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004464393179538616, + "loss": 3.2398, + "theoretical_loss": 4.031569468535704, + "tokens_seen": 382995456 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004464292878635908, + "loss": 3.0868, + "theoretical_loss": 4.031491731625986, + "tokens_seen": 383060992 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044641925777332, + "loss": 3.0105, + "theoretical_loss": 4.03141401173792, + "tokens_seen": 383126528 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044640922768304916, + "loss": 3.0205, + "theoretical_loss": 4.0313363088648675, + "tokens_seen": 383192064 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044639919759277834, + "loss": 3.197, + "theoretical_loss": 4.031258623000195, + "tokens_seen": 383257600 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004463891675025075, + "loss": 3.2168, + "theoretical_loss": 4.031180954137271, + "tokens_seen": 383323136 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 637151, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.914140462875366, + "objective/train/theoretical_loss": 4.03112271364349, + "objective/train/tokens_used": 403832288, + "theoretical_loss": 4.03112271364349, + "tokens_seen": 383372288 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044637913741223676, + "loss": 2.9733, + "theoretical_loss": 4.03110330226947, + "tokens_seen": 383388672 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004463691073219659, + "loss": 3.1659, + "theoretical_loss": 4.0310256673901685, + "tokens_seen": 383454208 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004463590772316951, + "loss": 3.2536, + "theoretical_loss": 4.030948049492747, + "tokens_seen": 383519744 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044634904714142425, + "loss": 3.3305, + "theoretical_loss": 4.030870448570591, + "tokens_seen": 383585280 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004463390170511535, + "loss": 3.0542, + "theoretical_loss": 4.030792864617087, + "tokens_seen": 383650816 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044632898696088266, + "loss": 3.1873, + "theoretical_loss": 4.030715297625628, + "tokens_seen": 383716352 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044631895687061184, + "loss": 3.2976, + "theoretical_loss": 4.030637747589609, + "tokens_seen": 383781888 + }, + { + "epoch": 1.01, + "learning_rate": 0.000446308926780341, + "loss": 3.2325, + "theoretical_loss": 4.03056021450243, + "tokens_seen": 383847424 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004462988966900702, + "loss": 3.0642, + "theoretical_loss": 4.030482698357494, + "tokens_seen": 383912960 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004462888665997994, + "loss": 3.2825, + "theoretical_loss": 4.030405199148206, + "tokens_seen": 383978496 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004462788365095286, + "loss": 3.1564, + "theoretical_loss": 4.030327716867979, + "tokens_seen": 384044032 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044626880641925775, + "loss": 3.0732, + "theoretical_loss": 4.030250251510225, + "tokens_seen": 384109568 + }, + { + "epoch": 1.01, + "learning_rate": 0.000446258776328987, + "loss": 3.2444, + "theoretical_loss": 4.030172803068362, + "tokens_seen": 384175104 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044624874623871617, + "loss": 3.1888, + "theoretical_loss": 4.030095371535813, + "tokens_seen": 384240640 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044623871614844535, + "loss": 3.2553, + "theoretical_loss": 4.030017956906001, + "tokens_seen": 384306176 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044622868605817453, + "loss": 3.2857, + "theoretical_loss": 4.029940559172355, + "tokens_seen": 384371712 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004462186559679037, + "loss": 3.1903, + "theoretical_loss": 4.029863178328309, + "tokens_seen": 384437248 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004462086258776329, + "loss": 3.1897, + "theoretical_loss": 4.0297858143672975, + "tokens_seen": 384502784 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004461985957873621, + "loss": 3.183, + "theoretical_loss": 4.029708467282761, + "tokens_seen": 384568320 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044618856569709125, + "loss": 3.193, + "theoretical_loss": 4.029631137068144, + "tokens_seen": 384633856 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004461785356068205, + "loss": 3.0665, + "theoretical_loss": 4.029553823716891, + "tokens_seen": 384699392 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004461685055165496, + "loss": 3.2558, + "theoretical_loss": 4.029476527222455, + "tokens_seen": 384764928 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044615847542627885, + "loss": 3.0548, + "theoretical_loss": 4.029399247578289, + "tokens_seen": 384830464 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044614844533600803, + "loss": 3.136, + "theoretical_loss": 4.029321984777853, + "tokens_seen": 384896000 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004461384152457372, + "loss": 3.0649, + "theoretical_loss": 4.029244738814607, + "tokens_seen": 384961536 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 638614, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8348186016082764, + "objective/train/theoretical_loss": 4.029186815387647, + "objective/train/tokens_used": 405470688, + "theoretical_loss": 4.029186815387647, + "tokens_seen": 385010688 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004461283851554664, + "loss": 2.9834, + "theoretical_loss": 4.029167509682017, + "tokens_seen": 385027072 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044611835506519563, + "loss": 3.2526, + "theoretical_loss": 4.029090297373552, + "tokens_seen": 385092608 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044610832497492476, + "loss": 3.0273, + "theoretical_loss": 4.029013101882684, + "tokens_seen": 385158144 + }, + { + "epoch": 1.01, + "learning_rate": 0.000446098294884654, + "loss": 3.0522, + "theoretical_loss": 4.02893592320289, + "tokens_seen": 385223680 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004460882647943831, + "loss": 3.1567, + "theoretical_loss": 4.02885876132765, + "tokens_seen": 385289216 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044607823470411235, + "loss": 3.209, + "theoretical_loss": 4.0287816162504475, + "tokens_seen": 385354752 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044606820461384153, + "loss": 3.1157, + "theoretical_loss": 4.02870448796477, + "tokens_seen": 385420288 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004460581745235707, + "loss": 3.2586, + "theoretical_loss": 4.028627376464108, + "tokens_seen": 385485824 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004460481444332999, + "loss": 3.1013, + "theoretical_loss": 4.028550281741957, + "tokens_seen": 385551360 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004460381143430291, + "loss": 3.2209, + "theoretical_loss": 4.028473203791813, + "tokens_seen": 385616896 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044602808425275826, + "loss": 3.3411, + "theoretical_loss": 4.028396142607179, + "tokens_seen": 385682432 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004460180541624875, + "loss": 3.2172, + "theoretical_loss": 4.028319098181561, + "tokens_seen": 385747968 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004460080240722166, + "loss": 3.2545, + "theoretical_loss": 4.028242070508467, + "tokens_seen": 385813504 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044599799398194586, + "loss": 3.0967, + "theoretical_loss": 4.0281650595814105, + "tokens_seen": 385879040 + }, + { + "epoch": 1.01, + "learning_rate": 0.000445987963891675, + "loss": 3.207, + "theoretical_loss": 4.028088065393907, + "tokens_seen": 385944576 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004459779338014042, + "loss": 3.2203, + "theoretical_loss": 4.0280110879394755, + "tokens_seen": 386010112 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004459679037111334, + "loss": 3.1775, + "theoretical_loss": 4.027934127211641, + "tokens_seen": 386075648 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004459578736208626, + "loss": 3.2597, + "theoretical_loss": 4.027857183203931, + "tokens_seen": 386141184 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044594784353059176, + "loss": 3.1984, + "theoretical_loss": 4.0277802559098745, + "tokens_seen": 386206720 + }, + { + "epoch": 1.01, + "learning_rate": 0.000445937813440321, + "loss": 3.1651, + "theoretical_loss": 4.027703345323006, + "tokens_seen": 386272256 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004459277833500502, + "loss": 3.1485, + "theoretical_loss": 4.027626451436864, + "tokens_seen": 386337792 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044591775325977936, + "loss": 3.1644, + "theoretical_loss": 4.027549574244989, + "tokens_seen": 386403328 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044590772316950854, + "loss": 3.1233, + "theoretical_loss": 4.027472713740927, + "tokens_seen": 386468864 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004458976930792377, + "loss": 3.0353, + "theoretical_loss": 4.027395869918227, + "tokens_seen": 386534400 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044588766298896696, + "loss": 3.1572, + "theoretical_loss": 4.02731904277044, + "tokens_seen": 386599936 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 641636, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8967413902282715, + "objective/train/theoretical_loss": 4.0272614333486345, + "objective/train/tokens_used": 407109088, + "theoretical_loss": 4.0272614333486345, + "tokens_seen": 386649088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004458776328986961, + "loss": 3.0821, + "theoretical_loss": 4.027242232291122, + "tokens_seen": 386665472 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004458676028084253, + "loss": 3.2112, + "theoretical_loss": 4.0271654384738325, + "tokens_seen": 386731008 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044585757271815445, + "loss": 3.2202, + "theoretical_loss": 4.027088661312135, + "tokens_seen": 386796544 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004458475426278837, + "loss": 3.317, + "theoretical_loss": 4.027011900799597, + "tokens_seen": 386862080 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044583751253761286, + "loss": 3.2001, + "theoretical_loss": 4.026935156929785, + "tokens_seen": 386927616 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044582748244734204, + "loss": 3.2286, + "theoretical_loss": 4.026858429696276, + "tokens_seen": 386993152 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004458174523570712, + "loss": 3.3174, + "theoretical_loss": 4.0267817190926465, + "tokens_seen": 387058688 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004458074222668004, + "loss": 3.1861, + "theoretical_loss": 4.026705025112476, + "tokens_seen": 387124224 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004457973921765296, + "loss": 3.1152, + "theoretical_loss": 4.026628347749351, + "tokens_seen": 387189760 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004457873620862588, + "loss": 3.2971, + "theoretical_loss": 4.026551686996857, + "tokens_seen": 387255296 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044577733199598795, + "loss": 3.054, + "theoretical_loss": 4.026475042848588, + "tokens_seen": 387320832 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004457673019057172, + "loss": 3.3026, + "theoretical_loss": 4.026398415298138, + "tokens_seen": 387386368 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044575727181544637, + "loss": 3.038, + "theoretical_loss": 4.026321804339105, + "tokens_seen": 387451904 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044574724172517555, + "loss": 3.1337, + "theoretical_loss": 4.026245209965092, + "tokens_seen": 387517440 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044573721163490473, + "loss": 3.0765, + "theoretical_loss": 4.026168632169703, + "tokens_seen": 387582976 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004457271815446339, + "loss": 3.2577, + "theoretical_loss": 4.02609207094655, + "tokens_seen": 387648512 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004457171514543631, + "loss": 3.0583, + "theoretical_loss": 4.026015526289244, + "tokens_seen": 387714048 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004457071213640923, + "loss": 3.0926, + "theoretical_loss": 4.0259389981914016, + "tokens_seen": 387779584 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044569709127382145, + "loss": 3.1185, + "theoretical_loss": 4.025862486646643, + "tokens_seen": 387845120 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004456870611835507, + "loss": 3.0658, + "theoretical_loss": 4.025785991648592, + "tokens_seen": 387910656 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004456770310932798, + "loss": 3.2336, + "theoretical_loss": 4.025709513190874, + "tokens_seen": 387976192 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044566700100300905, + "loss": 3.0969, + "theoretical_loss": 4.025633051267121, + "tokens_seen": 388041728 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044565697091273823, + "loss": 3.2491, + "theoretical_loss": 4.025556605870966, + "tokens_seen": 388107264 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004456469408224674, + "loss": 3.1633, + "theoretical_loss": 4.025480176996047, + "tokens_seen": 388172800 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004456369107321966, + "loss": 3.2533, + "theoretical_loss": 4.025403764636005, + "tokens_seen": 388238336 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 644548, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2800955772399902, + "objective/train/theoretical_loss": 4.025346466200038, + "objective/train/tokens_used": 408747488, + "theoretical_loss": 4.025346466200038, + "tokens_seen": 388287488 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044562688064192583, + "loss": 3.2207, + "theoretical_loss": 4.025327368784485, + "tokens_seen": 388303872 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044561685055165496, + "loss": 2.9839, + "theoretical_loss": 4.0252509894351345, + "tokens_seen": 388369408 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004456068204613842, + "loss": 3.0701, + "theoretical_loss": 4.025174626581606, + "tokens_seen": 388434944 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004455967903711133, + "loss": 3.0149, + "theoretical_loss": 4.025098280217552, + "tokens_seen": 388500480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044558676028084255, + "loss": 3.2593, + "theoretical_loss": 4.025021950336635, + "tokens_seen": 388566016 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044557673019057173, + "loss": 3.154, + "theoretical_loss": 4.0249456369325145, + "tokens_seen": 388631552 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004455667001003009, + "loss": 3.3953, + "theoretical_loss": 4.024869339998856, + "tokens_seen": 388697088 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004455566700100301, + "loss": 3.3765, + "theoretical_loss": 4.024793059529331, + "tokens_seen": 388762624 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004455466399197593, + "loss": 3.1836, + "theoretical_loss": 4.02471679551761, + "tokens_seen": 388828160 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044553660982948846, + "loss": 3.148, + "theoretical_loss": 4.024640547957369, + "tokens_seen": 388893696 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004455265797392177, + "loss": 3.3165, + "theoretical_loss": 4.024564316842289, + "tokens_seen": 388959232 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004455165496489468, + "loss": 3.1835, + "theoretical_loss": 4.024488102166052, + "tokens_seen": 389024768 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044550651955867606, + "loss": 3.2346, + "theoretical_loss": 4.024411903922346, + "tokens_seen": 389090304 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004454964894684052, + "loss": 3.1798, + "theoretical_loss": 4.02433572210486, + "tokens_seen": 389155840 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004454864593781344, + "loss": 3.194, + "theoretical_loss": 4.024259556707287, + "tokens_seen": 389221376 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004454764292878636, + "loss": 3.1143, + "theoretical_loss": 4.024183407723326, + "tokens_seen": 389286912 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004454663991975928, + "loss": 3.1828, + "theoretical_loss": 4.024107275146676, + "tokens_seen": 389352448 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044545636910732196, + "loss": 3.3196, + "theoretical_loss": 4.024031158971042, + "tokens_seen": 389417984 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004454463390170512, + "loss": 3.3109, + "theoretical_loss": 4.02395505919013, + "tokens_seen": 389483520 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004454363089267803, + "loss": 3.0763, + "theoretical_loss": 4.023878975797652, + "tokens_seen": 389549056 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044542627883650956, + "loss": 3.1387, + "theoretical_loss": 4.0238029087873235, + "tokens_seen": 389614592 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004454162487462387, + "loss": 3.3429, + "theoretical_loss": 4.023726858152861, + "tokens_seen": 389680128 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004454062186559679, + "loss": 3.2159, + "theoretical_loss": 4.023650823887985, + "tokens_seen": 389745664 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004453961885656971, + "loss": 3.1462, + "theoretical_loss": 4.023574805986423, + "tokens_seen": 389811200 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004453861584754263, + "loss": 3.1073, + "theoretical_loss": 4.0234988044419016, + "tokens_seen": 389876736 + }, + { + "epoch": 1.01, + "objective/train/docs_used": 647342, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.134502649307251, + "objective/train/theoretical_loss": 4.023441814014048, + "objective/train/tokens_used": 410385888, + "theoretical_loss": 4.023441814014048, + "tokens_seen": 389925888 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044537612838515547, + "loss": 3.1061, + "theoretical_loss": 4.023422819248153, + "tokens_seen": 389942272 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044536609829488465, + "loss": 3.1038, + "theoretical_loss": 4.023346850398912, + "tokens_seen": 390007808 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044535606820461383, + "loss": 3.1371, + "theoretical_loss": 4.023270897887917, + "tokens_seen": 390073344 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044534603811434306, + "loss": 3.0855, + "theoretical_loss": 4.023194961708912, + "tokens_seen": 390138880 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004453360080240722, + "loss": 3.3188, + "theoretical_loss": 4.02311904185564, + "tokens_seen": 390204416 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004453259779338014, + "loss": 3.0486, + "theoretical_loss": 4.023043138321851, + "tokens_seen": 390269952 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044531594784353055, + "loss": 3.1407, + "theoretical_loss": 4.022967251101298, + "tokens_seen": 390335488 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004453059177532598, + "loss": 3.2067, + "theoretical_loss": 4.022891380187737, + "tokens_seen": 390401024 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044529588766298897, + "loss": 3.1123, + "theoretical_loss": 4.022815525574927, + "tokens_seen": 390466560 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044528585757271815, + "loss": 3.234, + "theoretical_loss": 4.02273968725663, + "tokens_seen": 390532096 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044527582748244733, + "loss": 3.2708, + "theoretical_loss": 4.022663865226614, + "tokens_seen": 390597632 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044526579739217657, + "loss": 3.2436, + "theoretical_loss": 4.022588059478647, + "tokens_seen": 390663168 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004452557673019057, + "loss": 3.2767, + "theoretical_loss": 4.0225122700065015, + "tokens_seen": 390728704 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044524573721163493, + "loss": 3.0328, + "theoretical_loss": 4.022436496803956, + "tokens_seen": 390794240 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044523570712136406, + "loss": 3.2426, + "theoretical_loss": 4.022360739864789, + "tokens_seen": 390859776 + }, + { + "epoch": 1.01, + "learning_rate": 0.0004452256770310933, + "loss": 2.986, + "theoretical_loss": 4.022284999182785, + "tokens_seen": 390925312 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044521564694082247, + "loss": 3.2245, + "theoretical_loss": 4.02220927475173, + "tokens_seen": 390990848 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044520561685055165, + "loss": 3.3567, + "theoretical_loss": 4.022133566565413, + "tokens_seen": 391056384 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044519558676028083, + "loss": 3.1039, + "theoretical_loss": 4.02205787461763, + "tokens_seen": 391121920 + }, + { + "epoch": 1.01, + "learning_rate": 0.00044518555667001, + "loss": 3.1865, + "theoretical_loss": 4.021982198902176, + "tokens_seen": 391187456 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044517552657973925, + "loss": 3.2012, + "theoretical_loss": 4.0219065394128535, + "tokens_seen": 391252992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044516549648946843, + "loss": 3.1996, + "theoretical_loss": 4.021830896143463, + "tokens_seen": 391318528 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004451554663991976, + "loss": 3.1932, + "theoretical_loss": 4.021755269087815, + "tokens_seen": 391384064 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004451454363089268, + "loss": 3.0962, + "theoretical_loss": 4.0216796582397185, + "tokens_seen": 391449600 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044513540621865603, + "loss": 3.1985, + "theoretical_loss": 4.021604063592988, + "tokens_seen": 391515136 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 649632, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9291982650756836, + "objective/train/theoretical_loss": 4.021547378236367, + "objective/train/tokens_used": 412024288, + "theoretical_loss": 4.021547378236367, + "tokens_seen": 391564288 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044512537612838516, + "loss": 2.9919, + "theoretical_loss": 4.02152848514144, + "tokens_seen": 391580672 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004451153460381144, + "loss": 3.1339, + "theoretical_loss": 4.021452922878896, + "tokens_seen": 391646208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004451053159478435, + "loss": 3.1086, + "theoretical_loss": 4.0213773767991805, + "tokens_seen": 391711744 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044509528585757275, + "loss": 3.0535, + "theoretical_loss": 4.021301846896121, + "tokens_seen": 391777280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044508525576730194, + "loss": 2.9872, + "theoretical_loss": 4.021226333163547, + "tokens_seen": 391842816 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004450752256770311, + "loss": 3.1937, + "theoretical_loss": 4.021150835595295, + "tokens_seen": 391908352 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004450651955867603, + "loss": 3.0365, + "theoretical_loss": 4.021075354185201, + "tokens_seen": 391973888 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004450551654964895, + "loss": 3.0135, + "theoretical_loss": 4.020999888927107, + "tokens_seen": 392039424 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044504513540621866, + "loss": 3.1556, + "theoretical_loss": 4.020924439814857, + "tokens_seen": 392104960 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004450351053159479, + "loss": 3.3711, + "theoretical_loss": 4.0208490068423, + "tokens_seen": 392170496 + }, + { + "epoch": 1.02, + "learning_rate": 0.000445025075225677, + "loss": 3.1352, + "theoretical_loss": 4.0207735900032855, + "tokens_seen": 392236032 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044501504513540626, + "loss": 3.1299, + "theoretical_loss": 4.02069818929167, + "tokens_seen": 392301568 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004450050150451354, + "loss": 3.1064, + "theoretical_loss": 4.0206228047013095, + "tokens_seen": 392367104 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004449949849548646, + "loss": 3.1937, + "theoretical_loss": 4.020547436226067, + "tokens_seen": 392432640 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004449849548645938, + "loss": 3.1136, + "theoretical_loss": 4.020472083859806, + "tokens_seen": 392498176 + }, + { + "epoch": 1.02, + "learning_rate": 0.000444974924774323, + "loss": 3.0878, + "theoretical_loss": 4.020396747596395, + "tokens_seen": 392563712 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044496489468405216, + "loss": 3.2516, + "theoretical_loss": 4.020321427429705, + "tokens_seen": 392629248 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004449548645937814, + "loss": 3.1303, + "theoretical_loss": 4.020246123353612, + "tokens_seen": 392694784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004449448345035105, + "loss": 3.0496, + "theoretical_loss": 4.020170835361992, + "tokens_seen": 392760320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044493480441323976, + "loss": 3.1409, + "theoretical_loss": 4.020095563448729, + "tokens_seen": 392825856 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004449247743229689, + "loss": 3.2253, + "theoretical_loss": 4.020020307607706, + "tokens_seen": 392891392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004449147442326981, + "loss": 3.0753, + "theoretical_loss": 4.019945067832811, + "tokens_seen": 392956928 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004449047141424273, + "loss": 3.0196, + "theoretical_loss": 4.019869844117938, + "tokens_seen": 393022464 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004448946840521565, + "loss": 3.143, + "theoretical_loss": 4.019794636456979, + "tokens_seen": 393088000 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044488465396188567, + "loss": 3.233, + "theoretical_loss": 4.019719444843833, + "tokens_seen": 393153536 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 652342, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6542298793792725, + "objective/train/theoretical_loss": 4.0196630616616815, + "objective/train/tokens_used": 413662688, + "theoretical_loss": 4.0196630616616815, + "tokens_seen": 393202688 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044487462387161485, + "loss": 3.0277, + "theoretical_loss": 4.019644269272401, + "tokens_seen": 393219072 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044486459378134403, + "loss": 3.2137, + "theoretical_loss": 4.01956910973659, + "tokens_seen": 393284608 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044485456369107326, + "loss": 3.1393, + "theoretical_loss": 4.019493966230306, + "tokens_seen": 393350144 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004448445336008024, + "loss": 3.2826, + "theoretical_loss": 4.019418838747462, + "tokens_seen": 393415680 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004448345035105316, + "loss": 3.183, + "theoretical_loss": 4.019343727281971, + "tokens_seen": 393481216 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044482447342026075, + "loss": 3.078, + "theoretical_loss": 4.019268631827752, + "tokens_seen": 393546752 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044481444332999, + "loss": 3.1833, + "theoretical_loss": 4.019193552378728, + "tokens_seen": 393612288 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044480441323971917, + "loss": 3.1218, + "theoretical_loss": 4.019118488928822, + "tokens_seen": 393677824 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044479438314944835, + "loss": 3.1452, + "theoretical_loss": 4.019043441471962, + "tokens_seen": 393743360 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044478435305917753, + "loss": 3.2086, + "theoretical_loss": 4.01896841000208, + "tokens_seen": 393808896 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044477432296890677, + "loss": 3.2029, + "theoretical_loss": 4.018893394513112, + "tokens_seen": 393874432 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004447642928786359, + "loss": 3.2286, + "theoretical_loss": 4.018818394998994, + "tokens_seen": 393939968 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044475426278836513, + "loss": 3.0273, + "theoretical_loss": 4.018743411453668, + "tokens_seen": 394005504 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044474423269809426, + "loss": 3.1475, + "theoretical_loss": 4.018668443871079, + "tokens_seen": 394071040 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004447342026078235, + "loss": 3.2064, + "theoretical_loss": 4.018593492245175, + "tokens_seen": 394136576 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044472417251755267, + "loss": 3.2534, + "theoretical_loss": 4.018518556569908, + "tokens_seen": 394202112 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044471414242728185, + "loss": 3.2696, + "theoretical_loss": 4.018443636839231, + "tokens_seen": 394267648 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044470411233701103, + "loss": 3.0836, + "theoretical_loss": 4.018368733047102, + "tokens_seen": 394333184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004446940822467402, + "loss": 3.4221, + "theoretical_loss": 4.018293845187483, + "tokens_seen": 394398720 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004446840521564694, + "loss": 3.1603, + "theoretical_loss": 4.018218973254338, + "tokens_seen": 394464256 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044467402206619863, + "loss": 2.9791, + "theoretical_loss": 4.018144117241635, + "tokens_seen": 394529792 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044466399197592776, + "loss": 3.2082, + "theoretical_loss": 4.018069277143344, + "tokens_seen": 394595328 + }, + { + "epoch": 1.02, + "learning_rate": 0.000444653961885657, + "loss": 3.3634, + "theoretical_loss": 4.017994452953441, + "tokens_seen": 394660864 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004446439317953861, + "loss": 3.2129, + "theoretical_loss": 4.017919644665903, + "tokens_seen": 394726400 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044463390170511536, + "loss": 3.0515, + "theoretical_loss": 4.01784485227471, + "tokens_seen": 394791936 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 655042, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2554378509521484, + "objective/train/theoretical_loss": 4.017788768409673, + "objective/train/tokens_used": 415301088, + "theoretical_loss": 4.017788768409673, + "tokens_seen": 394841088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044462387161484454, + "loss": 3.2178, + "theoretical_loss": 4.017770075773846, + "tokens_seen": 394857472 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004446138415245737, + "loss": 3.0772, + "theoretical_loss": 4.017695315157301, + "tokens_seen": 394923008 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004446038114343029, + "loss": 3.1444, + "theoretical_loss": 4.017620570419063, + "tokens_seen": 394988544 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044459378134403214, + "loss": 3.3153, + "theoretical_loss": 4.017545841553127, + "tokens_seen": 395054080 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044458375125376126, + "loss": 3.163, + "theoretical_loss": 4.01747112855349, + "tokens_seen": 395119616 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004445737211634905, + "loss": 3.1484, + "theoretical_loss": 4.017396431414152, + "tokens_seen": 395185152 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004445636910732196, + "loss": 3.2633, + "theoretical_loss": 4.017321750129118, + "tokens_seen": 395250688 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044455366098294886, + "loss": 3.1052, + "theoretical_loss": 4.017247084692394, + "tokens_seen": 395316224 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044454363089267804, + "loss": 3.2092, + "theoretical_loss": 4.01717243509799, + "tokens_seen": 395381760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004445336008024072, + "loss": 3.043, + "theoretical_loss": 4.01709780133992, + "tokens_seen": 395447296 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004445235707121364, + "loss": 3.1425, + "theoretical_loss": 4.017023183412203, + "tokens_seen": 395512832 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004445135406218656, + "loss": 3.0887, + "theoretical_loss": 4.016948581308855, + "tokens_seen": 395578368 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044450351053159477, + "loss": 3.1231, + "theoretical_loss": 4.016873995023902, + "tokens_seen": 395643904 + }, + { + "epoch": 1.02, + "learning_rate": 0.000444493480441324, + "loss": 3.1233, + "theoretical_loss": 4.016799424551369, + "tokens_seen": 395709440 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044448345035105313, + "loss": 3.2139, + "theoretical_loss": 4.016724869885286, + "tokens_seen": 395774976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044447342026078236, + "loss": 3.2433, + "theoretical_loss": 4.016650331019688, + "tokens_seen": 395840512 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004444633901705115, + "loss": 3.2393, + "theoretical_loss": 4.016575807948609, + "tokens_seen": 395906048 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004444533600802407, + "loss": 3.1951, + "theoretical_loss": 4.016501300666089, + "tokens_seen": 395971584 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044444332998996996, + "loss": 3.2018, + "theoretical_loss": 4.016426809166172, + "tokens_seen": 396037120 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004444332998996991, + "loss": 3.0324, + "theoretical_loss": 4.016352333442902, + "tokens_seen": 396102656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004444232698094283, + "loss": 3.2193, + "theoretical_loss": 4.0162778734903295, + "tokens_seen": 396168192 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004444132397191575, + "loss": 3.1952, + "theoretical_loss": 4.016203429302507, + "tokens_seen": 396233728 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004444032096288867, + "loss": 3.0309, + "theoretical_loss": 4.016129000873489, + "tokens_seen": 396299264 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044439317953861587, + "loss": 3.0868, + "theoretical_loss": 4.016054588197336, + "tokens_seen": 396364800 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044438314944834505, + "loss": 3.1764, + "theoretical_loss": 4.015980191268109, + "tokens_seen": 396430336 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 657795, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.081172227859497, + "objective/train/theoretical_loss": 4.015924403901538, + "objective/train/tokens_used": 416939488, + "theoretical_loss": 4.015924403901538, + "tokens_seen": 396479488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044437311935807423, + "loss": 3.2853, + "theoretical_loss": 4.015905810079873, + "tokens_seen": 396495872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044436308926780346, + "loss": 3.1241, + "theoretical_loss": 4.015831444626697, + "tokens_seen": 396561408 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004443530591775326, + "loss": 3.068, + "theoretical_loss": 4.0157570949026535, + "tokens_seen": 396626944 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004443430290872618, + "loss": 3.3117, + "theoretical_loss": 4.015682760901816, + "tokens_seen": 396692480 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044433299899699095, + "loss": 3.1764, + "theoretical_loss": 4.015608442618264, + "tokens_seen": 396758016 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004443229689067202, + "loss": 3.1436, + "theoretical_loss": 4.015534140046078, + "tokens_seen": 396823552 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044431293881644937, + "loss": 3.2679, + "theoretical_loss": 4.015459853179342, + "tokens_seen": 396889088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044430290872617855, + "loss": 3.3819, + "theoretical_loss": 4.015385582012146, + "tokens_seen": 396954624 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044429287863590773, + "loss": 3.0104, + "theoretical_loss": 4.01531132653858, + "tokens_seen": 397020160 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044428284854563697, + "loss": 3.0533, + "theoretical_loss": 4.0152370867527365, + "tokens_seen": 397085696 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004442728184553661, + "loss": 3.1295, + "theoretical_loss": 4.015162862648714, + "tokens_seen": 397151232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044426278836509533, + "loss": 3.1765, + "theoretical_loss": 4.015088654220614, + "tokens_seen": 397216768 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044425275827482446, + "loss": 3.3442, + "theoretical_loss": 4.01501446146254, + "tokens_seen": 397282304 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004442427281845537, + "loss": 3.0135, + "theoretical_loss": 4.014940284368598, + "tokens_seen": 397347840 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044423269809428287, + "loss": 3.3941, + "theoretical_loss": 4.014866122932899, + "tokens_seen": 397413376 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044422266800401205, + "loss": 2.9625, + "theoretical_loss": 4.014791977149556, + "tokens_seen": 397478912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044421263791374123, + "loss": 3.1311, + "theoretical_loss": 4.014717847012685, + "tokens_seen": 397544448 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004442026078234704, + "loss": 3.321, + "theoretical_loss": 4.014643732516407, + "tokens_seen": 397609984 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004441925777331996, + "loss": 3.2165, + "theoretical_loss": 4.014569633654844, + "tokens_seen": 397675520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044418254764292883, + "loss": 3.1987, + "theoretical_loss": 4.014495550422121, + "tokens_seen": 397741056 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044417251755265796, + "loss": 3.1377, + "theoretical_loss": 4.01442148281237, + "tokens_seen": 397806592 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004441624874623872, + "loss": 3.1163, + "theoretical_loss": 4.01434743081972, + "tokens_seen": 397872128 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004441524573721163, + "loss": 3.1802, + "theoretical_loss": 4.01427339443831, + "tokens_seen": 397937664 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044414242728184556, + "loss": 3.19, + "theoretical_loss": 4.014199373662277, + "tokens_seen": 398003200 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044413239719157474, + "loss": 3.2034, + "theoretical_loss": 4.014125368485762, + "tokens_seen": 398068736 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 659073, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.104768991470337, + "objective/train/theoretical_loss": 4.014069874837038, + "objective/train/tokens_used": 418577888, + "theoretical_loss": 4.014069874837038, + "tokens_seen": 398117888 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004441223671013039, + "loss": 3.2066, + "theoretical_loss": 4.014051378902911, + "tokens_seen": 398134272 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004441123370110331, + "loss": 3.0634, + "theoretical_loss": 4.013977404907873, + "tokens_seen": 398199808 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044410230692076234, + "loss": 3.1392, + "theoretical_loss": 4.0139034464947985, + "tokens_seen": 398265344 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044409227683049146, + "loss": 3.1451, + "theoretical_loss": 4.013829503657842, + "tokens_seen": 398330880 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004440822467402207, + "loss": 3.0473, + "theoretical_loss": 4.013755576391161, + "tokens_seen": 398396416 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004440722166499498, + "loss": 3.1116, + "theoretical_loss": 4.013681664688917, + "tokens_seen": 398461952 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044406218655967906, + "loss": 3.0242, + "theoretical_loss": 4.013607768545274, + "tokens_seen": 398527488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044405215646940824, + "loss": 3.2772, + "theoretical_loss": 4.013533887954399, + "tokens_seen": 398593024 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004440421263791374, + "loss": 3.1303, + "theoretical_loss": 4.013460022910461, + "tokens_seen": 398658560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004440320962888666, + "loss": 3.3397, + "theoretical_loss": 4.013386173407636, + "tokens_seen": 398724096 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004440220661985958, + "loss": 3.1866, + "theoretical_loss": 4.013312339440099, + "tokens_seen": 398789632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044401203610832497, + "loss": 3.2401, + "theoretical_loss": 4.013238521002029, + "tokens_seen": 398855168 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004440020060180542, + "loss": 3.1803, + "theoretical_loss": 4.01316471808761, + "tokens_seen": 398920704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044399197592778333, + "loss": 3.2189, + "theoretical_loss": 4.013090930691028, + "tokens_seen": 398986240 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044398194583751256, + "loss": 3.1827, + "theoretical_loss": 4.0130171588064725, + "tokens_seen": 399051776 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004439719157472417, + "loss": 3.2207, + "theoretical_loss": 4.012943402428134, + "tokens_seen": 399117312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004439618856569709, + "loss": 3.0937, + "theoretical_loss": 4.0128696615502095, + "tokens_seen": 399182848 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004439518555667001, + "loss": 3.0878, + "theoretical_loss": 4.012795936166897, + "tokens_seen": 399248384 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004439418254764293, + "loss": 3.0338, + "theoretical_loss": 4.012722226272397, + "tokens_seen": 399313920 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044393179538615847, + "loss": 3.1021, + "theoretical_loss": 4.012648531860917, + "tokens_seen": 399379456 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004439217652958877, + "loss": 3.2216, + "theoretical_loss": 4.012574852926662, + "tokens_seen": 399444992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044391173520561683, + "loss": 3.2403, + "theoretical_loss": 4.012501189463843, + "tokens_seen": 399510528 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044390170511534607, + "loss": 3.1463, + "theoretical_loss": 4.012427541466677, + "tokens_seen": 399576064 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004438916750250752, + "loss": 3.1034, + "theoretical_loss": 4.012353908929379, + "tokens_seen": 399641600 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044388164493480443, + "loss": 3.0787, + "theoretical_loss": 4.012280291846169, + "tokens_seen": 399707136 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 661929, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.228034019470215, + "objective/train/theoretical_loss": 4.012225089172033, + "objective/train/tokens_used": 420216288, + "theoretical_loss": 4.012225089172033, + "tokens_seen": 399756288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004438716148445336, + "loss": 3.133, + "theoretical_loss": 4.012206690211272, + "tokens_seen": 399772672 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004438615847542628, + "loss": 3.1495, + "theoretical_loss": 4.012133104018914, + "tokens_seen": 399838208 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044385155466399197, + "loss": 3.1181, + "theoretical_loss": 4.012059533263323, + "tokens_seen": 399903744 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044384152457372115, + "loss": 3.2695, + "theoretical_loss": 4.011985977938735, + "tokens_seen": 399969280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044383149448345033, + "loss": 3.0268, + "theoretical_loss": 4.011912438039381, + "tokens_seen": 400034816 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044382146439317957, + "loss": 3.1714, + "theoretical_loss": 4.011838913559505, + "tokens_seen": 400100352 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004438114343029087, + "loss": 3.2432, + "theoretical_loss": 4.011765404493346, + "tokens_seen": 400165888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044380140421263793, + "loss": 3.2439, + "theoretical_loss": 4.01169191083515, + "tokens_seen": 400231424 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004437913741223671, + "loss": 3.1407, + "theoretical_loss": 4.011618432579166, + "tokens_seen": 400296960 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004437813440320963, + "loss": 3.2621, + "theoretical_loss": 4.011544969719644, + "tokens_seen": 400362496 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004437713139418255, + "loss": 3.2348, + "theoretical_loss": 4.011471522250838, + "tokens_seen": 400428032 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044376128385155466, + "loss": 3.2594, + "theoretical_loss": 4.011398090167007, + "tokens_seen": 400493568 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044375125376128384, + "loss": 3.1102, + "theoretical_loss": 4.011324673462411, + "tokens_seen": 400559104 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044374122367101307, + "loss": 3.163, + "theoretical_loss": 4.011251272131313, + "tokens_seen": 400624640 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004437311935807422, + "loss": 3.1257, + "theoretical_loss": 4.01117788616798, + "tokens_seen": 400690176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044372116349047143, + "loss": 2.9543, + "theoretical_loss": 4.011104515566682, + "tokens_seen": 400755712 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044371113340020056, + "loss": 3.2814, + "theoretical_loss": 4.011031160321693, + "tokens_seen": 400821248 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004437011033099298, + "loss": 3.1526, + "theoretical_loss": 4.010957820427286, + "tokens_seen": 400886784 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044369107321965903, + "loss": 3.0704, + "theoretical_loss": 4.010884495877743, + "tokens_seen": 400952320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044368104312938816, + "loss": 3.1344, + "theoretical_loss": 4.010811186667344, + "tokens_seen": 401017856 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004436710130391174, + "loss": 3.0813, + "theoretical_loss": 4.010737892790376, + "tokens_seen": 401083392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004436609829488465, + "loss": 3.0936, + "theoretical_loss": 4.010664614241124, + "tokens_seen": 401148928 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044365095285857576, + "loss": 3.2651, + "theoretical_loss": 4.010591351013883, + "tokens_seen": 401214464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044364092276830494, + "loss": 2.9969, + "theoretical_loss": 4.010518103102945, + "tokens_seen": 401280000 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004436308926780341, + "loss": 3.2136, + "theoretical_loss": 4.010444870502608, + "tokens_seen": 401345536 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 664949, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9795851707458496, + "objective/train/theoretical_loss": 4.010389956096509, + "objective/train/tokens_used": 421854688, + "theoretical_loss": 4.010389956096509, + "tokens_seen": 401394688 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004436208625877633, + "loss": 3.1875, + "theoretical_loss": 4.010371653207173, + "tokens_seen": 401411072 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044361083249749254, + "loss": 3.2205, + "theoretical_loss": 4.010298451210942, + "tokens_seen": 401476608 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044360080240722166, + "loss": 3.1087, + "theoretical_loss": 4.010225264508223, + "tokens_seen": 401542144 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004435907723169509, + "loss": 3.2692, + "theoretical_loss": 4.010152093093325, + "tokens_seen": 401607680 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044358074222668, + "loss": 3.246, + "theoretical_loss": 4.01007893696056, + "tokens_seen": 401673216 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044357071213640926, + "loss": 2.9391, + "theoretical_loss": 4.010005796104245, + "tokens_seen": 401738752 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044356068204613844, + "loss": 3.0966, + "theoretical_loss": 4.0099326705186975, + "tokens_seen": 401804288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004435506519558676, + "loss": 3.1343, + "theoretical_loss": 4.009859560198239, + "tokens_seen": 401869824 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004435406218655968, + "loss": 3.225, + "theoretical_loss": 4.0097864651371955, + "tokens_seen": 401935360 + }, + { + "epoch": 1.02, + "learning_rate": 0.000443530591775326, + "loss": 3.229, + "theoretical_loss": 4.009713385329894, + "tokens_seen": 402000896 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044352056168505517, + "loss": 3.3355, + "theoretical_loss": 4.009640320770666, + "tokens_seen": 402066432 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004435105315947844, + "loss": 3.1945, + "theoretical_loss": 4.009567271453845, + "tokens_seen": 402131968 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044350050150451353, + "loss": 3.1251, + "theoretical_loss": 4.009494237373768, + "tokens_seen": 402197504 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044349047141424276, + "loss": 3.2294, + "theoretical_loss": 4.009421218524774, + "tokens_seen": 402263040 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004434804413239719, + "loss": 3.1429, + "theoretical_loss": 4.009348214901207, + "tokens_seen": 402328576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004434704112337011, + "loss": 3.0846, + "theoretical_loss": 4.0092752264974125, + "tokens_seen": 402394112 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004434603811434303, + "loss": 3.2589, + "theoretical_loss": 4.00920225330774, + "tokens_seen": 402459648 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004434503510531595, + "loss": 3.2282, + "theoretical_loss": 4.009129295326542, + "tokens_seen": 402525184 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044344032096288867, + "loss": 3.2625, + "theoretical_loss": 4.009056352548171, + "tokens_seen": 402590720 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004434302908726179, + "loss": 3.3743, + "theoretical_loss": 4.008983424966988, + "tokens_seen": 402656256 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044342026078234703, + "loss": 3.181, + "theoretical_loss": 4.008910512577351, + "tokens_seen": 402721792 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044341023069207627, + "loss": 3.1035, + "theoretical_loss": 4.008837615373627, + "tokens_seen": 402787328 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004434002006018054, + "loss": 3.234, + "theoretical_loss": 4.008764733350183, + "tokens_seen": 402852864 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044339017051153463, + "loss": 3.3051, + "theoretical_loss": 4.008691866501387, + "tokens_seen": 402918400 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004433801404212638, + "loss": 3.0766, + "theoretical_loss": 4.008619014821613, + "tokens_seen": 402983936 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 667718, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0787782669067383, + "objective/train/theoretical_loss": 4.008564386013069, + "objective/train/tokens_used": 423493088, + "theoretical_loss": 4.008564386013069, + "tokens_seen": 403033088 + }, + { + "epoch": 1.02, + "learning_rate": 0.000443370110330993, + "loss": 3.1563, + "theoretical_loss": 4.008546178305236, + "tokens_seen": 403049472 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044336008024072217, + "loss": 3.1707, + "theoretical_loss": 4.008473356946638, + "tokens_seen": 403115008 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044335005015045135, + "loss": 3.1852, + "theoretical_loss": 4.008400550740198, + "tokens_seen": 403180544 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044334002006018053, + "loss": 3.047, + "theoretical_loss": 4.008327759680304, + "tokens_seen": 403246080 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044332998996990977, + "loss": 3.3164, + "theoretical_loss": 4.008254983761341, + "tokens_seen": 403311616 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004433199598796389, + "loss": 3.1911, + "theoretical_loss": 4.008182222977702, + "tokens_seen": 403377152 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044330992978936813, + "loss": 3.1154, + "theoretical_loss": 4.00810947732378, + "tokens_seen": 403442688 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004432998996990973, + "loss": 3.0319, + "theoretical_loss": 4.008036746793973, + "tokens_seen": 403508224 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004432898696088265, + "loss": 3.1542, + "theoretical_loss": 4.007964031382681, + "tokens_seen": 403573760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004432798395185557, + "loss": 3.3206, + "theoretical_loss": 4.007891331084306, + "tokens_seen": 403639296 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044326980942828486, + "loss": 3.1936, + "theoretical_loss": 4.007818645893254, + "tokens_seen": 403704832 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044325977933801404, + "loss": 3.1803, + "theoretical_loss": 4.007745975803934, + "tokens_seen": 403770368 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044324974924774327, + "loss": 3.2156, + "theoretical_loss": 4.00767332081076, + "tokens_seen": 403835904 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004432397191574724, + "loss": 3.0932, + "theoretical_loss": 4.007600680908144, + "tokens_seen": 403901440 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044322968906720163, + "loss": 3.1284, + "theoretical_loss": 4.007528056090505, + "tokens_seen": 403966976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044321965897693076, + "loss": 3.1204, + "theoretical_loss": 4.007455446352266, + "tokens_seen": 404032512 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044320962888666, + "loss": 3.3074, + "theoretical_loss": 4.007382851687847, + "tokens_seen": 404098048 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004431995987963892, + "loss": 3.09, + "theoretical_loss": 4.007310272091677, + "tokens_seen": 404163584 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044318956870611836, + "loss": 3.231, + "theoretical_loss": 4.007237707558185, + "tokens_seen": 404229120 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044317953861584754, + "loss": 3.1677, + "theoretical_loss": 4.007165158081804, + "tokens_seen": 404294656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004431695085255767, + "loss": 3.2036, + "theoretical_loss": 4.007092623656971, + "tokens_seen": 404360192 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004431594784353059, + "loss": 3.2642, + "theoretical_loss": 4.007020104278122, + "tokens_seen": 404425728 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044314944834503514, + "loss": 3.1629, + "theoretical_loss": 4.0069475999397, + "tokens_seen": 404491264 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044313941825476427, + "loss": 3.1261, + "theoretical_loss": 4.00687511063615, + "tokens_seen": 404556800 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004431293881644935, + "loss": 3.2923, + "theoretical_loss": 4.006802636361918, + "tokens_seen": 404622336 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 670599, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0098390579223633, + "objective/train/theoretical_loss": 4.0067482905158975, + "objective/train/tokens_used": 425131488, + "theoretical_loss": 4.0067482905158975, + "tokens_seen": 404671488 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004431193580742227, + "loss": 3.1329, + "theoretical_loss": 4.006730177111456, + "tokens_seen": 404687872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044310932798395186, + "loss": 3.1087, + "theoretical_loss": 4.0066577328792174, + "tokens_seen": 404753408 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044309929789368104, + "loss": 3.1998, + "theoretical_loss": 4.006585303659657, + "tokens_seen": 404818944 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004430892678034102, + "loss": 3.0798, + "theoretical_loss": 4.006512889447235, + "tokens_seen": 404884480 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004430792377131394, + "loss": 3.2126, + "theoretical_loss": 4.006440490236414, + "tokens_seen": 404950016 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044306920762286864, + "loss": 3.2529, + "theoretical_loss": 4.006368106021657, + "tokens_seen": 405015552 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044305917753259777, + "loss": 3.1097, + "theoretical_loss": 4.006295736797436, + "tokens_seen": 405081088 + }, + { + "epoch": 1.02, + "learning_rate": 0.000443049147442327, + "loss": 3.0051, + "theoretical_loss": 4.006223382558218, + "tokens_seen": 405146624 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044303911735205613, + "loss": 3.2149, + "theoretical_loss": 4.00615104329848, + "tokens_seen": 405212160 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044302908726178537, + "loss": 2.9111, + "theoretical_loss": 4.006078719012697, + "tokens_seen": 405277696 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044301905717151455, + "loss": 3.2494, + "theoretical_loss": 4.00600640969535, + "tokens_seen": 405343232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044300902708124373, + "loss": 3.1076, + "theoretical_loss": 4.005934115340921, + "tokens_seen": 405408768 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004429989969909729, + "loss": 3.3117, + "theoretical_loss": 4.005861835943895, + "tokens_seen": 405474304 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004429889669007021, + "loss": 3.0727, + "theoretical_loss": 4.005789571498761, + "tokens_seen": 405539840 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044297893681043127, + "loss": 3.261, + "theoretical_loss": 4.005717322000012, + "tokens_seen": 405605376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004429689067201605, + "loss": 3.1788, + "theoretical_loss": 4.005645087442142, + "tokens_seen": 405670912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044295887662988963, + "loss": 3.0285, + "theoretical_loss": 4.005572867819646, + "tokens_seen": 405736448 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044294884653961887, + "loss": 3.2805, + "theoretical_loss": 4.0055006631270285, + "tokens_seen": 405801984 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004429388164493481, + "loss": 3.0515, + "theoretical_loss": 4.005428473358788, + "tokens_seen": 405867520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044292878635907723, + "loss": 3.1008, + "theoretical_loss": 4.005356298509433, + "tokens_seen": 405933056 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044291875626880647, + "loss": 3.0333, + "theoretical_loss": 4.005284138573473, + "tokens_seen": 405998592 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004429087261785356, + "loss": 3.3087, + "theoretical_loss": 4.0052119935454185, + "tokens_seen": 406064128 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044289869608826483, + "loss": 3.1272, + "theoretical_loss": 4.005139863419785, + "tokens_seen": 406129664 + }, + { + "epoch": 1.02, + "learning_rate": 0.000442888665997994, + "loss": 3.3728, + "theoretical_loss": 4.00506774819109, + "tokens_seen": 406195200 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004428786359077232, + "loss": 3.1563, + "theoretical_loss": 4.004995647853855, + "tokens_seen": 406260736 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 673420, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.262136459350586, + "objective/train/theoretical_loss": 4.004941582370154, + "objective/train/tokens_used": 426769888, + "theoretical_loss": 4.004941582370154, + "tokens_seen": 406309888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044286860581745237, + "loss": 3.1771, + "theoretical_loss": 4.0049235624026025, + "tokens_seen": 406326272 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044285857572718155, + "loss": 3.2467, + "theoretical_loss": 4.004851491831859, + "tokens_seen": 406391808 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044284854563691073, + "loss": 3.3503, + "theoretical_loss": 4.004779436136154, + "tokens_seen": 406457344 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044283851554663997, + "loss": 3.1805, + "theoretical_loss": 4.004707395310019, + "tokens_seen": 406522880 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004428284854563691, + "loss": 3.1013, + "theoretical_loss": 4.004635369347991, + "tokens_seen": 406588416 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044281845536609833, + "loss": 3.1699, + "theoretical_loss": 4.0045633582446065, + "tokens_seen": 406653952 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004428084252758275, + "loss": 3.2766, + "theoretical_loss": 4.004491361994406, + "tokens_seen": 406719488 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004427983951855567, + "loss": 3.1611, + "theoretical_loss": 4.0044193805919335, + "tokens_seen": 406785024 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004427883650952859, + "loss": 3.1342, + "theoretical_loss": 4.004347414031736, + "tokens_seen": 406850560 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044277833500501506, + "loss": 3.2487, + "theoretical_loss": 4.004275462308364, + "tokens_seen": 406916096 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044276830491474424, + "loss": 3.0841, + "theoretical_loss": 4.004203525416369, + "tokens_seen": 406981632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044275827482447347, + "loss": 3.1323, + "theoretical_loss": 4.004131603350305, + "tokens_seen": 407047168 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004427482447342026, + "loss": 3.2235, + "theoretical_loss": 4.004059696104732, + "tokens_seen": 407112704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044273821464393184, + "loss": 3.0451, + "theoretical_loss": 4.003987803674209, + "tokens_seen": 407178240 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044272818455366096, + "loss": 3.0954, + "theoretical_loss": 4.003915926053303, + "tokens_seen": 407243776 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004427181544633902, + "loss": 2.9985, + "theoretical_loss": 4.003844063236578, + "tokens_seen": 407309312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004427081243731194, + "loss": 3.2108, + "theoretical_loss": 4.003772215218604, + "tokens_seen": 407374848 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044269809428284856, + "loss": 3.0903, + "theoretical_loss": 4.003700381993955, + "tokens_seen": 407440384 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044268806419257774, + "loss": 3.1488, + "theoretical_loss": 4.003628563557205, + "tokens_seen": 407505920 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004426780341023069, + "loss": 3.434, + "theoretical_loss": 4.003556759902933, + "tokens_seen": 407571456 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004426680040120361, + "loss": 3.1308, + "theoretical_loss": 4.00348497102572, + "tokens_seen": 407636992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044265797392176534, + "loss": 3.1032, + "theoretical_loss": 4.003413196920148, + "tokens_seen": 407702528 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044264794383149447, + "loss": 3.1383, + "theoretical_loss": 4.003341437580806, + "tokens_seen": 407768064 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004426379137412237, + "loss": 3.22, + "theoretical_loss": 4.0032696930022835, + "tokens_seen": 407833600 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004426278836509529, + "loss": 3.1651, + "theoretical_loss": 4.003197963179172, + "tokens_seen": 407899136 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 674928, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1407558917999268, + "objective/train/theoretical_loss": 4.003144175491826, + "objective/train/tokens_used": 428408288, + "theoretical_loss": 4.003144175491826, + "tokens_seen": 407948288 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044261785356068206, + "loss": 3.0899, + "theoretical_loss": 4.003126248106068, + "tokens_seen": 407964672 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044260782347041124, + "loss": 3.2377, + "theoretical_loss": 4.003054547777569, + "tokens_seen": 408030208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004425977933801404, + "loss": 3.1762, + "theoretical_loss": 4.002982862188276, + "tokens_seen": 408095744 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004425877632898696, + "loss": 3.2694, + "theoretical_loss": 4.002911191332792, + "tokens_seen": 408161280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044257773319959884, + "loss": 3.1849, + "theoretical_loss": 4.002839535205725, + "tokens_seen": 408226816 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044256770310932797, + "loss": 3.0997, + "theoretical_loss": 4.002767893801685, + "tokens_seen": 408292352 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004425576730190572, + "loss": 3.1077, + "theoretical_loss": 4.002696267115282, + "tokens_seen": 408357888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044254764292878633, + "loss": 3.2707, + "theoretical_loss": 4.002624655141134, + "tokens_seen": 408423424 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044253761283851557, + "loss": 3.0962, + "theoretical_loss": 4.0025530578738575, + "tokens_seen": 408488960 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044252758274824475, + "loss": 3.1818, + "theoretical_loss": 4.002481475308074, + "tokens_seen": 408554496 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044251755265797393, + "loss": 3.1194, + "theoretical_loss": 4.002409907438407, + "tokens_seen": 408620032 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004425075225677031, + "loss": 3.1363, + "theoretical_loss": 4.002338354259483, + "tokens_seen": 408685568 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004424974924774323, + "loss": 3.1255, + "theoretical_loss": 4.002266815765931, + "tokens_seen": 408751104 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044248746238716147, + "loss": 3.1597, + "theoretical_loss": 4.002195291952384, + "tokens_seen": 408816640 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004424774322968907, + "loss": 3.3395, + "theoretical_loss": 4.002123782813476, + "tokens_seen": 408882176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044246740220661983, + "loss": 3.268, + "theoretical_loss": 4.0020522883438465, + "tokens_seen": 408947712 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044245737211634907, + "loss": 3.1508, + "theoretical_loss": 4.001980808538135, + "tokens_seen": 409013248 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044244734202607825, + "loss": 3.1252, + "theoretical_loss": 4.0019093433909845, + "tokens_seen": 409078784 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044243731193580743, + "loss": 3.0447, + "theoretical_loss": 4.001837892897042, + "tokens_seen": 409144320 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004424272818455366, + "loss": 3.1921, + "theoretical_loss": 4.001766457050957, + "tokens_seen": 409209856 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004424172517552658, + "loss": 3.2355, + "theoretical_loss": 4.00169503584738, + "tokens_seen": 409275392 + }, + { + "epoch": 1.02, + "learning_rate": 0.000442407221664995, + "loss": 3.1056, + "theoretical_loss": 4.001623629280967, + "tokens_seen": 409340928 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004423971915747242, + "loss": 3.2038, + "theoretical_loss": 4.001552237346376, + "tokens_seen": 409406464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044238716148445334, + "loss": 3.2191, + "theoretical_loss": 4.001480860038265, + "tokens_seen": 409472000 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044237713139418257, + "loss": 3.1494, + "theoretical_loss": 4.0014094973513, + "tokens_seen": 409537536 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 678633, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.191779136657715, + "objective/train/theoretical_loss": 4.001355984927994, + "objective/train/tokens_used": 430046688, + "theoretical_loss": 4.001355984927994, + "tokens_seen": 409586688 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004423671013039117, + "loss": 3.2908, + "theoretical_loss": 4.001338149280146, + "tokens_seen": 409603072 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044235707121364093, + "loss": 3.1, + "theoretical_loss": 4.00126681581947, + "tokens_seen": 409668608 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004423470411233701, + "loss": 3.0167, + "theoretical_loss": 4.001195496963946, + "tokens_seen": 409734144 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004423370110330993, + "loss": 2.9814, + "theoretical_loss": 4.001124192708247, + "tokens_seen": 409799680 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004423269809428285, + "loss": 3.2118, + "theoretical_loss": 4.001052903047049, + "tokens_seen": 409865216 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004423169508525577, + "loss": 3.14, + "theoretical_loss": 4.000981627975034, + "tokens_seen": 409930752 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044230692076228684, + "loss": 3.1289, + "theoretical_loss": 4.000910367486885, + "tokens_seen": 409996288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004422968906720161, + "loss": 3.3336, + "theoretical_loss": 4.000839121577285, + "tokens_seen": 410061824 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004422868605817452, + "loss": 2.9782, + "theoretical_loss": 4.000767890240924, + "tokens_seen": 410127360 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044227683049147444, + "loss": 3.1651, + "theoretical_loss": 4.000696673472493, + "tokens_seen": 410192896 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004422668004012036, + "loss": 3.0427, + "theoretical_loss": 4.0006254712666856, + "tokens_seen": 410258432 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004422567703109328, + "loss": 3.2119, + "theoretical_loss": 4.000554283618198, + "tokens_seen": 410323968 + }, + { + "epoch": 1.02, + "learning_rate": 0.000442246740220662, + "loss": 3.0618, + "theoretical_loss": 4.000483110521731, + "tokens_seen": 410389504 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044223671013039116, + "loss": 3.0682, + "theoretical_loss": 4.000411951971985, + "tokens_seen": 410455040 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044222668004012034, + "loss": 3.1118, + "theoretical_loss": 4.000340807963666, + "tokens_seen": 410520576 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004422166499498496, + "loss": 3.2237, + "theoretical_loss": 4.000269678491482, + "tokens_seen": 410586112 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004422066198595787, + "loss": 3.1674, + "theoretical_loss": 4.000198563550143, + "tokens_seen": 410651648 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044219658976930794, + "loss": 3.1102, + "theoretical_loss": 4.000127463134361, + "tokens_seen": 410717184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004421865596790371, + "loss": 3.1728, + "theoretical_loss": 4.000056377238854, + "tokens_seen": 410782720 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004421765295887663, + "loss": 3.0402, + "theoretical_loss": 3.99998530585834, + "tokens_seen": 410848256 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044216649949849554, + "loss": 3.2421, + "theoretical_loss": 3.999914248987541, + "tokens_seen": 410913792 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044215646940822467, + "loss": 3.0587, + "theoretical_loss": 3.999843206621181, + "tokens_seen": 410979328 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004421464393179539, + "loss": 3.2606, + "theoretical_loss": 3.999772178753987, + "tokens_seen": 411044864 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004421364092276831, + "loss": 3.1867, + "theoretical_loss": 3.999701165380688, + "tokens_seen": 411110400 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044212637913741226, + "loss": 3.2511, + "theoretical_loss": 3.9996301664960185, + "tokens_seen": 411175936 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 679975, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1327946186065674, + "objective/train/theoretical_loss": 3.999576926837511, + "objective/train/tokens_used": 431685088, + "theoretical_loss": 3.999576926837511, + "tokens_seen": 411225088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044211634904714144, + "loss": 3.2176, + "theoretical_loss": 3.9995591820947123, + "tokens_seen": 411241472 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004421063189568706, + "loss": 3.0818, + "theoretical_loss": 3.9994882121715083, + "tokens_seen": 411307008 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004420962888665998, + "loss": 3.038, + "theoretical_loss": 3.999417256721147, + "tokens_seen": 411372544 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044208625877632904, + "loss": 3.1944, + "theoretical_loss": 3.9993463157383715, + "tokens_seen": 411438080 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044207622868605817, + "loss": 3.2266, + "theoretical_loss": 3.999275389217929, + "tokens_seen": 411503616 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004420661985957874, + "loss": 3.0343, + "theoretical_loss": 3.999204477154568, + "tokens_seen": 411569152 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044205616850551653, + "loss": 3.2625, + "theoretical_loss": 3.9991335795430407, + "tokens_seen": 411634688 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044204613841524577, + "loss": 2.9818, + "theoretical_loss": 3.9990626963781017, + "tokens_seen": 411700224 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044203610832497495, + "loss": 3.023, + "theoretical_loss": 3.998991827654508, + "tokens_seen": 411765760 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044202607823470413, + "loss": 3.1603, + "theoretical_loss": 3.99892097336702, + "tokens_seen": 411831296 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004420160481444333, + "loss": 3.0499, + "theoretical_loss": 3.9988501335104, + "tokens_seen": 411896832 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004420060180541625, + "loss": 3.1674, + "theoretical_loss": 3.9987793080794134, + "tokens_seen": 411962368 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044199598796389167, + "loss": 3.2409, + "theoretical_loss": 3.9987084970688294, + "tokens_seen": 412027904 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004419859578736209, + "loss": 3.2742, + "theoretical_loss": 3.9986377004734184, + "tokens_seen": 412093440 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044197592778335003, + "loss": 3.0811, + "theoretical_loss": 3.9985669182879535, + "tokens_seen": 412158976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044196589769307927, + "loss": 3.1495, + "theoretical_loss": 3.9984961505072123, + "tokens_seen": 412224512 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044195586760280845, + "loss": 2.9383, + "theoretical_loss": 3.998425397125973, + "tokens_seen": 412290048 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044194583751253763, + "loss": 3.09, + "theoretical_loss": 3.998354658139018, + "tokens_seen": 412355584 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004419358074222668, + "loss": 3.128, + "theoretical_loss": 3.9982839335411313, + "tokens_seen": 412421120 + }, + { + "epoch": 1.02, + "learning_rate": 0.000441925777331996, + "loss": 3.2105, + "theoretical_loss": 3.9982132233271006, + "tokens_seen": 412486656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004419157472417252, + "loss": 2.9994, + "theoretical_loss": 3.9981425274917166, + "tokens_seen": 412552192 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004419057171514544, + "loss": 2.9679, + "theoretical_loss": 3.998071846029771, + "tokens_seen": 412617728 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044189568706118354, + "loss": 3.2731, + "theoretical_loss": 3.998001178936059, + "tokens_seen": 412683264 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044188565697091277, + "loss": 3.0729, + "theoretical_loss": 3.99793052620538, + "tokens_seen": 412748800 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004418756268806419, + "loss": 3.3009, + "theoretical_loss": 3.9978598878325338, + "tokens_seen": 412814336 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 682814, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.242856740951538, + "objective/train/theoretical_loss": 3.9978069184721012, + "objective/train/tokens_used": 433323488, + "theoretical_loss": 3.9978069184721012, + "tokens_seen": 412863488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044186559679037113, + "loss": 3.0449, + "theoretical_loss": 3.997789263812325, + "tokens_seen": 412879872 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004418555667001003, + "loss": 2.9991, + "theoretical_loss": 3.9977186541395584, + "tokens_seen": 412945408 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004418455366098295, + "loss": 3.1094, + "theoretical_loss": 3.9976480588090446, + "tokens_seen": 413010944 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004418355065195587, + "loss": 3.2147, + "theoretical_loss": 3.997577477815594, + "tokens_seen": 413076480 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004418254764292879, + "loss": 3.1206, + "theoretical_loss": 3.997506911154022, + "tokens_seen": 413142016 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044181544633901704, + "loss": 3.1495, + "theoretical_loss": 3.9974363588191446, + "tokens_seen": 413207552 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004418054162487463, + "loss": 3.2226, + "theoretical_loss": 3.9973658208057827, + "tokens_seen": 413273088 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004417953861584754, + "loss": 3.1155, + "theoretical_loss": 3.997295297108758, + "tokens_seen": 413338624 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044178535606820464, + "loss": 3.016, + "theoretical_loss": 3.997224787722896, + "tokens_seen": 413404160 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004417753259779338, + "loss": 3.1656, + "theoretical_loss": 3.9971542926430246, + "tokens_seen": 413469696 + }, + { + "epoch": 1.02, + "learning_rate": 0.000441765295887663, + "loss": 3.0347, + "theoretical_loss": 3.9970838118639733, + "tokens_seen": 413535232 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004417552657973922, + "loss": 3.0662, + "theoretical_loss": 3.9970133453805774, + "tokens_seen": 413600768 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044174523570712136, + "loss": 3.2027, + "theoretical_loss": 3.996942893187671, + "tokens_seen": 413666304 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044173520561685054, + "loss": 3.1131, + "theoretical_loss": 3.9968724552800934, + "tokens_seen": 413731840 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004417251755265798, + "loss": 3.1739, + "theoretical_loss": 3.9968020316526855, + "tokens_seen": 413797376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004417151454363089, + "loss": 3.0899, + "theoretical_loss": 3.9967316223002918, + "tokens_seen": 413862912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044170511534603814, + "loss": 3.1454, + "theoretical_loss": 3.9966612272177593, + "tokens_seen": 413928448 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044169508525576727, + "loss": 3.2687, + "theoretical_loss": 3.9965908463999362, + "tokens_seen": 413993984 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004416850551654965, + "loss": 3.0659, + "theoretical_loss": 3.996520479841675, + "tokens_seen": 414059520 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004416750250752257, + "loss": 3.2851, + "theoretical_loss": 3.9964501275378304, + "tokens_seen": 414125056 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044166499498495487, + "loss": 3.2863, + "theoretical_loss": 3.9963797894832602, + "tokens_seen": 414190592 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044165496489468405, + "loss": 3.1496, + "theoretical_loss": 3.9963094656728235, + "tokens_seen": 414256128 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004416449348044133, + "loss": 3.044, + "theoretical_loss": 3.9962391561013826, + "tokens_seen": 414321664 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004416349047141424, + "loss": 3.1411, + "theoretical_loss": 3.996168860763805, + "tokens_seen": 414387200 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044162487462387164, + "loss": 3.1838, + "theoretical_loss": 3.9960985796549564, + "tokens_seen": 414452736 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 685529, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0357508659362793, + "objective/train/theoretical_loss": 3.9960458781578385, + "objective/train/tokens_used": 434961888, + "theoretical_loss": 3.9960458781578385, + "tokens_seen": 414501888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044161484453360077, + "loss": 3.2674, + "theoretical_loss": 3.9960283127697087, + "tokens_seen": 414518272 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044160481444333, + "loss": 3.1129, + "theoretical_loss": 3.9959580601029345, + "tokens_seen": 414583808 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004415947843530592, + "loss": 3.309, + "theoretical_loss": 3.99588782164951, + "tokens_seen": 414649344 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044158475426278837, + "loss": 3.1665, + "theoretical_loss": 3.9958175974043146, + "tokens_seen": 414714880 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044157472417251755, + "loss": 3.2184, + "theoretical_loss": 3.9957473873622287, + "tokens_seen": 414780416 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044156469408224673, + "loss": 3.237, + "theoretical_loss": 3.995677191518136, + "tokens_seen": 414845952 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004415546639919759, + "loss": 3.0982, + "theoretical_loss": 3.9956070098669243, + "tokens_seen": 414911488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044154463390170515, + "loss": 3.1422, + "theoretical_loss": 3.9955368424034816, + "tokens_seen": 414977024 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004415346038114343, + "loss": 3.0775, + "theoretical_loss": 3.9954666891227, + "tokens_seen": 415042560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004415245737211635, + "loss": 3.2786, + "theoretical_loss": 3.995396550019475, + "tokens_seen": 415108096 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044151454363089264, + "loss": 3.1665, + "theoretical_loss": 3.995326425088703, + "tokens_seen": 415173632 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044150451354062187, + "loss": 3.2386, + "theoretical_loss": 3.9952563143252835, + "tokens_seen": 415239168 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044149448345035105, + "loss": 3.2035, + "theoretical_loss": 3.995186217724119, + "tokens_seen": 415304704 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044148445336008023, + "loss": 3.1193, + "theoretical_loss": 3.9951161352801154, + "tokens_seen": 415370240 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004414744232698094, + "loss": 3.2242, + "theoretical_loss": 3.99504606698818, + "tokens_seen": 415435776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044146439317953865, + "loss": 3.1895, + "theoretical_loss": 3.9949760128432232, + "tokens_seen": 415501312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004414543630892678, + "loss": 3.0362, + "theoretical_loss": 3.994905972840158, + "tokens_seen": 415566848 + }, + { + "epoch": 1.02, + "learning_rate": 0.000441444332998997, + "loss": 3.1484, + "theoretical_loss": 3.9948359469738994, + "tokens_seen": 415632384 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004414343029087262, + "loss": 2.9682, + "theoretical_loss": 3.994765935239367, + "tokens_seen": 415697920 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004414242728184554, + "loss": 3.1354, + "theoretical_loss": 3.9946959376314797, + "tokens_seen": 415763456 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004414142427281846, + "loss": 3.1788, + "theoretical_loss": 3.994625954145163, + "tokens_seen": 415828992 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044140421263791374, + "loss": 3.0868, + "theoretical_loss": 3.9945559847753422, + "tokens_seen": 415894528 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044139418254764297, + "loss": 3.0675, + "theoretical_loss": 3.9944860295169455, + "tokens_seen": 415960064 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004413841524573721, + "loss": 3.152, + "theoretical_loss": 3.9944160883649054, + "tokens_seen": 416025600 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044137412236710133, + "loss": 3.2197, + "theoretical_loss": 3.994346161314155, + "tokens_seen": 416091136 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 688429, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.343158006668091, + "objective/train/theoretical_loss": 3.994293725277018, + "objective/train/tokens_used": 436600288, + "theoretical_loss": 3.994293725277018, + "tokens_seen": 416140288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004413640922768305, + "loss": 3.1656, + "theoretical_loss": 3.9942762483596312, + "tokens_seen": 416156672 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004413540621865597, + "loss": 3.0511, + "theoretical_loss": 3.994206349496274, + "tokens_seen": 416222208 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004413440320962889, + "loss": 3.2489, + "theoretical_loss": 3.9941364647190234, + "tokens_seen": 416287744 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004413340020060181, + "loss": 3.2026, + "theoretical_loss": 3.9940665940228257, + "tokens_seen": 416353280 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044132397191574724, + "loss": 3.2365, + "theoretical_loss": 3.993996737402627, + "tokens_seen": 416418816 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004413139418254765, + "loss": 3.2711, + "theoretical_loss": 3.9939268948533773, + "tokens_seen": 416484352 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004413039117352056, + "loss": 3.2677, + "theoretical_loss": 3.9938570663700284, + "tokens_seen": 416549888 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044129388164493484, + "loss": 3.1479, + "theoretical_loss": 3.993787251947536, + "tokens_seen": 416615424 + }, + { + "epoch": 1.02, + "learning_rate": 0.000441283851554664, + "loss": 3.1233, + "theoretical_loss": 3.9937174515808564, + "tokens_seen": 416680960 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004412738214643932, + "loss": 2.9685, + "theoretical_loss": 3.993647665264951, + "tokens_seen": 416746496 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004412637913741224, + "loss": 3.1239, + "theoretical_loss": 3.9935778929947814, + "tokens_seen": 416812032 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044125376128385156, + "loss": 3.3788, + "theoretical_loss": 3.993508134765314, + "tokens_seen": 416877568 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044124373119358074, + "loss": 3.1574, + "theoretical_loss": 3.9934383905715154, + "tokens_seen": 416943104 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044123370110331, + "loss": 3.1066, + "theoretical_loss": 3.9933686604083576, + "tokens_seen": 417008640 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004412236710130391, + "loss": 3.275, + "theoretical_loss": 3.993298944270812, + "tokens_seen": 417074176 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044121364092276834, + "loss": 3.2613, + "theoretical_loss": 3.993229242153855, + "tokens_seen": 417139712 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044120361083249747, + "loss": 3.1587, + "theoretical_loss": 3.993159554052465, + "tokens_seen": 417205248 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004411935807422267, + "loss": 3.0968, + "theoretical_loss": 3.993089879961623, + "tokens_seen": 417270784 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004411835506519559, + "loss": 3.2023, + "theoretical_loss": 3.9930202198763114, + "tokens_seen": 417336320 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044117352056168507, + "loss": 3.0554, + "theoretical_loss": 3.992950573791518, + "tokens_seen": 417401856 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044116349047141425, + "loss": 3.2013, + "theoretical_loss": 3.9928809417022295, + "tokens_seen": 417467392 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004411534603811435, + "loss": 3.0694, + "theoretical_loss": 3.992811323603438, + "tokens_seen": 417532928 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004411434302908726, + "loss": 3.1439, + "theoretical_loss": 3.992741719490137, + "tokens_seen": 417598464 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044113340020060184, + "loss": 3.2814, + "theoretical_loss": 3.992672129357323, + "tokens_seen": 417664000 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044112337011033097, + "loss": 3.1426, + "theoretical_loss": 3.9926025531999945, + "tokens_seen": 417729536 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 690993, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.01431941986084, + "objective/train/theoretical_loss": 3.992550380250404, + "objective/train/tokens_used": 438238688, + "theoretical_loss": 3.992550380250404, + "tokens_seen": 417778688 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004411133400200602, + "loss": 3.1272, + "theoretical_loss": 3.9925329910131535, + "tokens_seen": 417795072 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004411033099297894, + "loss": 3.1022, + "theoretical_loss": 3.992463442791804, + "tokens_seen": 417860608 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044109327983951857, + "loss": 3.1021, + "theoretical_loss": 3.9923939085309517, + "tokens_seen": 417926144 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044108324974924775, + "loss": 2.8738, + "theoretical_loss": 3.992324388225607, + "tokens_seen": 417991680 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044107321965897693, + "loss": 3.0053, + "theoretical_loss": 3.992254881870781, + "tokens_seen": 418057216 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004410631895687061, + "loss": 3.179, + "theoretical_loss": 3.9921853894614885, + "tokens_seen": 418122752 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044105315947843535, + "loss": 3.1836, + "theoretical_loss": 3.9921159109927453, + "tokens_seen": 418188288 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004410431293881645, + "loss": 3.0575, + "theoretical_loss": 3.9920464464595717, + "tokens_seen": 418253824 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004410330992978937, + "loss": 3.1543, + "theoretical_loss": 3.99197699585699, + "tokens_seen": 418319360 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044102306920762284, + "loss": 3.1153, + "theoretical_loss": 3.9919075591800235, + "tokens_seen": 418384896 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044101303911735207, + "loss": 3.1589, + "theoretical_loss": 3.9918381364237003, + "tokens_seen": 418450432 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044100300902708125, + "loss": 3.2453, + "theoretical_loss": 3.9917687275830493, + "tokens_seen": 418515968 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044099297893681043, + "loss": 3.1177, + "theoretical_loss": 3.991699332653104, + "tokens_seen": 418581504 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004409829488465396, + "loss": 3.0618, + "theoretical_loss": 3.991629951628898, + "tokens_seen": 418647040 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044097291875626885, + "loss": 3.3568, + "theoretical_loss": 3.991560584505469, + "tokens_seen": 418712576 + }, + { + "epoch": 1.02, + "learning_rate": 0.000440962888665998, + "loss": 3.0912, + "theoretical_loss": 3.9914912312778568, + "tokens_seen": 418778112 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004409528585757272, + "loss": 3.0623, + "theoretical_loss": 3.9914218919411035, + "tokens_seen": 418843648 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044094282848545634, + "loss": 3.1834, + "theoretical_loss": 3.9913525664902547, + "tokens_seen": 418909184 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004409327983951856, + "loss": 3.1081, + "theoretical_loss": 3.991283254920358, + "tokens_seen": 418974720 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044092276830491476, + "loss": 3.2273, + "theoretical_loss": 3.9912139572264618, + "tokens_seen": 419040256 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044091273821464394, + "loss": 3.1318, + "theoretical_loss": 3.9911446734036207, + "tokens_seen": 419105792 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004409027081243731, + "loss": 2.9195, + "theoretical_loss": 3.9910754034468887, + "tokens_seen": 419171328 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004408926780341023, + "loss": 3.047, + "theoretical_loss": 3.9910061473513236, + "tokens_seen": 419236864 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004408826479438315, + "loss": 3.3012, + "theoretical_loss": 3.990936905111986, + "tokens_seen": 419302400 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004408726178535607, + "loss": 3.1094, + "theoretical_loss": 3.990867676723938, + "tokens_seen": 419367936 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 693733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9626636505126953, + "objective/train/theoretical_loss": 3.9908157645198425, + "objective/train/tokens_used": 439877088, + "theoretical_loss": 3.9908157645198425, + "tokens_seen": 419417088 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044086258776328984, + "loss": 3.1299, + "theoretical_loss": 3.990798462182245, + "tokens_seen": 419433472 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004408525576730191, + "loss": 3.2119, + "theoretical_loss": 3.990729261481975, + "tokens_seen": 419499008 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004408425275827482, + "loss": 3.0502, + "theoretical_loss": 3.9906600746181984, + "tokens_seen": 419564544 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044083249749247744, + "loss": 3.1387, + "theoretical_loss": 3.9905909015859873, + "tokens_seen": 419630080 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004408224674022066, + "loss": 3.1051, + "theoretical_loss": 3.990521742380418, + "tokens_seen": 419695616 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004408124373119358, + "loss": 3.2158, + "theoretical_loss": 3.990452596996567, + "tokens_seen": 419761152 + }, + { + "epoch": 1.02, + "learning_rate": 0.000440802407221665, + "loss": 3.1952, + "theoretical_loss": 3.9903834654295167, + "tokens_seen": 419826688 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004407923771313942, + "loss": 3.3002, + "theoretical_loss": 3.9903143476743486, + "tokens_seen": 419892224 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044078234704112335, + "loss": 3.1461, + "theoretical_loss": 3.9902452437261475, + "tokens_seen": 419957760 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004407723169508526, + "loss": 3.2976, + "theoretical_loss": 3.990176153580003, + "tokens_seen": 420023296 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004407622868605817, + "loss": 3.2354, + "theoretical_loss": 3.9901070772310048, + "tokens_seen": 420088832 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044075225677031094, + "loss": 3.2347, + "theoretical_loss": 3.9900380146742456, + "tokens_seen": 420154368 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004407422266800401, + "loss": 3.0218, + "theoretical_loss": 3.989968965904821, + "tokens_seen": 420219904 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004407321965897693, + "loss": 3.1873, + "theoretical_loss": 3.9898999309178294, + "tokens_seen": 420285440 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004407221664994985, + "loss": 3.0513, + "theoretical_loss": 3.989830909708371, + "tokens_seen": 420350976 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044071213640922767, + "loss": 3.0534, + "theoretical_loss": 3.9897619022715483, + "tokens_seen": 420416512 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044070210631895685, + "loss": 2.8991, + "theoretical_loss": 3.9896929086024677, + "tokens_seen": 420482048 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004406920762286861, + "loss": 3.0426, + "theoretical_loss": 3.9896239286962367, + "tokens_seen": 420547584 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044068204613841527, + "loss": 3.2186, + "theoretical_loss": 3.9895549625479654, + "tokens_seen": 420613120 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044067201604814445, + "loss": 3.1937, + "theoretical_loss": 3.989486010152768, + "tokens_seen": 420678656 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004406619859578737, + "loss": 3.1626, + "theoretical_loss": 3.9894170715057586, + "tokens_seen": 420744192 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004406519558676028, + "loss": 3.1007, + "theoretical_loss": 3.989348146602056, + "tokens_seen": 420809728 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044064192577733204, + "loss": 3.1494, + "theoretical_loss": 3.9892792354367805, + "tokens_seen": 420875264 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044063189568706117, + "loss": 3.0951, + "theoretical_loss": 3.989210338005055, + "tokens_seen": 420940800 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004406218655967904, + "loss": 3.26, + "theoretical_loss": 3.9891414543020054, + "tokens_seen": 421006336 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 695092, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.029510498046875, + "objective/train/theoretical_loss": 3.98908980053123, + "objective/train/tokens_used": 441515488, + "theoretical_loss": 3.98908980053123, + "tokens_seen": 421055488 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004406118355065196, + "loss": 3.173, + "theoretical_loss": 3.9890725843227592, + "tokens_seen": 421071872 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044060180541624877, + "loss": 3.112, + "theoretical_loss": 3.989003728062446, + "tokens_seen": 421137408 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044059177532597795, + "loss": 3.0469, + "theoretical_loss": 3.988934885516201, + "tokens_seen": 421202944 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044058174523570713, + "loss": 3.2076, + "theoretical_loss": 3.9888660566791576, + "tokens_seen": 421268480 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004405717151454363, + "loss": 3.1368, + "theoretical_loss": 3.9887972415464548, + "tokens_seen": 421334016 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044056168505516555, + "loss": 3.0653, + "theoretical_loss": 3.988728440113232, + "tokens_seen": 421399552 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004405516549648947, + "loss": 3.2541, + "theoretical_loss": 3.9886596523746327, + "tokens_seen": 421465088 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004405416248746239, + "loss": 3.0114, + "theoretical_loss": 3.9885908783258026, + "tokens_seen": 421530624 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044053159478435304, + "loss": 3.1663, + "theoretical_loss": 3.988522117961888, + "tokens_seen": 421596160 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044052156469408227, + "loss": 3.0487, + "theoretical_loss": 3.988453371278041, + "tokens_seen": 421661696 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044051153460381145, + "loss": 3.135, + "theoretical_loss": 3.9883846382694133, + "tokens_seen": 421727232 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044050150451354063, + "loss": 3.29, + "theoretical_loss": 3.9883159189311606, + "tokens_seen": 421792768 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004404914744232698, + "loss": 3.0735, + "theoretical_loss": 3.9882472132584397, + "tokens_seen": 421858304 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044048144433299905, + "loss": 2.9283, + "theoretical_loss": 3.988178521246412, + "tokens_seen": 421923840 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004404714142427282, + "loss": 3.1242, + "theoretical_loss": 3.988109842890239, + "tokens_seen": 421989376 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004404613841524574, + "loss": 3.1417, + "theoretical_loss": 3.988041178185087, + "tokens_seen": 422054912 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044045135406218654, + "loss": 3.3378, + "theoretical_loss": 3.987972527126122, + "tokens_seen": 422120448 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004404413239719158, + "loss": 3.1697, + "theoretical_loss": 3.987903889708515, + "tokens_seen": 422185984 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044043129388164496, + "loss": 3.1454, + "theoretical_loss": 3.987835265927439, + "tokens_seen": 422251520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044042126379137414, + "loss": 3.0017, + "theoretical_loss": 3.9877666557780675, + "tokens_seen": 422317056 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004404112337011033, + "loss": 3.0379, + "theoretical_loss": 3.987698059255579, + "tokens_seen": 422382592 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004404012036108325, + "loss": 2.9597, + "theoretical_loss": 3.987629476355153, + "tokens_seen": 422448128 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004403911735205617, + "loss": 3.0509, + "theoretical_loss": 3.9875609070719715, + "tokens_seen": 422513664 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004403811434302909, + "loss": 3.1401, + "theoretical_loss": 3.9874923514012193, + "tokens_seen": 422579200 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044037111334002004, + "loss": 3.1011, + "theoretical_loss": 3.9874238093380843, + "tokens_seen": 422644736 + }, + { + "epoch": 1.02, + "objective/train/docs_used": 697758, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1520400047302246, + "objective/train/theoretical_loss": 3.9873724117178373, + "objective/train/tokens_used": 443153888, + "theoretical_loss": 3.9873724117178373, + "tokens_seen": 422693888 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004403610832497493, + "loss": 3.1451, + "theoretical_loss": 3.987355280877755, + "tokens_seen": 422710272 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004403510531594784, + "loss": 3.1837, + "theoretical_loss": 3.9872867660154245, + "tokens_seen": 422775808 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044034102306920764, + "loss": 3.1477, + "theoretical_loss": 3.9872182647462866, + "tokens_seen": 422841344 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004403309929789368, + "loss": 3.1252, + "theoretical_loss": 3.9871497770655386, + "tokens_seen": 422906880 + }, + { + "epoch": 1.02, + "learning_rate": 0.000440320962888666, + "loss": 3.1594, + "theoretical_loss": 3.9870813029683796, + "tokens_seen": 422972416 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004403109327983952, + "loss": 3.1793, + "theoretical_loss": 3.987012842450012, + "tokens_seen": 423037952 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004403009027081244, + "loss": 2.9637, + "theoretical_loss": 3.98694439550564, + "tokens_seen": 423103488 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044029087261785355, + "loss": 3.0793, + "theoretical_loss": 3.98687596213047, + "tokens_seen": 423169024 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004402808425275828, + "loss": 3.1454, + "theoretical_loss": 3.9868075423197107, + "tokens_seen": 423234560 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004402708124373119, + "loss": 2.9815, + "theoretical_loss": 3.986739136068574, + "tokens_seen": 423300096 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044026078234704114, + "loss": 3.1202, + "theoretical_loss": 3.986670743372275, + "tokens_seen": 423365632 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004402507522567703, + "loss": 3.2653, + "theoretical_loss": 3.9866023642260293, + "tokens_seen": 423431168 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004402407221664995, + "loss": 2.9485, + "theoretical_loss": 3.986533998625056, + "tokens_seen": 423496704 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004402306920762287, + "loss": 3.064, + "theoretical_loss": 3.986465646564575, + "tokens_seen": 423562240 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044022066198595787, + "loss": 3.2005, + "theoretical_loss": 3.9863973080398125, + "tokens_seen": 423627776 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044021063189568705, + "loss": 3.2412, + "theoretical_loss": 3.9863289830459925, + "tokens_seen": 423693312 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004402006018054163, + "loss": 3.1539, + "theoretical_loss": 3.986260671578345, + "tokens_seen": 423758848 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004401905717151454, + "loss": 3.224, + "theoretical_loss": 3.9861923736321003, + "tokens_seen": 423824384 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044018054162487465, + "loss": 3.02, + "theoretical_loss": 3.986124089202492, + "tokens_seen": 423889920 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044017051153460383, + "loss": 3.1558, + "theoretical_loss": 3.9860558182847554, + "tokens_seen": 423955456 + }, + { + "epoch": 1.02, + "learning_rate": 0.000440160481444333, + "loss": 3.1493, + "theoretical_loss": 3.9859875608741295, + "tokens_seen": 424020992 + }, + { + "epoch": 1.02, + "learning_rate": 0.0004401504513540622, + "loss": 3.276, + "theoretical_loss": 3.985919316965855, + "tokens_seen": 424086528 + }, + { + "epoch": 1.02, + "learning_rate": 0.00044014042126379137, + "loss": 3.1029, + "theoretical_loss": 3.985851086555174, + "tokens_seen": 424152064 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044013039117352055, + "loss": 3.1585, + "theoretical_loss": 3.985782869637333, + "tokens_seen": 424217600 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004401203610832498, + "loss": 3.2466, + "theoretical_loss": 3.9857146662075795, + "tokens_seen": 424283136 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 700507, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1688287258148193, + "objective/train/theoretical_loss": 3.9856635224839643, + "objective/train/tokens_used": 444792288, + "theoretical_loss": 3.9856635224839643, + "tokens_seen": 424332288 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004401103309929789, + "loss": 3.2016, + "theoretical_loss": 3.985646476261164, + "tokens_seen": 424348672 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044010030090270815, + "loss": 2.9694, + "theoretical_loss": 3.985578299793339, + "tokens_seen": 424414208 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004400902708124373, + "loss": 3.0216, + "theoretical_loss": 3.985510136799359, + "tokens_seen": 424479744 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004400802407221665, + "loss": 3.1776, + "theoretical_loss": 3.985441987274483, + "tokens_seen": 424545280 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004400702106318957, + "loss": 3.0371, + "theoretical_loss": 3.9853738512139696, + "tokens_seen": 424610816 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004400601805416249, + "loss": 3.1462, + "theoretical_loss": 3.9853057286130813, + "tokens_seen": 424676352 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044005015045135406, + "loss": 3.2018, + "theoretical_loss": 3.9852376194670835, + "tokens_seen": 424741888 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044004012036108324, + "loss": 3.2218, + "theoretical_loss": 3.985169523771243, + "tokens_seen": 424807424 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004400300902708124, + "loss": 3.2237, + "theoretical_loss": 3.9851014415208286, + "tokens_seen": 424872960 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044002006018054165, + "loss": 3.1737, + "theoretical_loss": 3.985033372711113, + "tokens_seen": 424938496 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004400100300902708, + "loss": 3.2252, + "theoretical_loss": 3.9849653173373705, + "tokens_seen": 425004032 + }, + { + "epoch": 1.03, + "learning_rate": 0.00044, + "loss": 3.1051, + "theoretical_loss": 3.984897275394877, + "tokens_seen": 425069568 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004399899699097292, + "loss": 3.1097, + "theoretical_loss": 3.9848292468789124, + "tokens_seen": 425135104 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004399799398194584, + "loss": 3.024, + "theoretical_loss": 3.9847612317847574, + "tokens_seen": 425200640 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043996990972918756, + "loss": 3.0288, + "theoretical_loss": 3.9846932301076965, + "tokens_seen": 425266176 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043995987963891674, + "loss": 3.2102, + "theoretical_loss": 3.984625241843016, + "tokens_seen": 425331712 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004399498495486459, + "loss": 3.1089, + "theoretical_loss": 3.984557266986004, + "tokens_seen": 425397248 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043993981945837516, + "loss": 3.1852, + "theoretical_loss": 3.9844893055319517, + "tokens_seen": 425462784 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043992978936810434, + "loss": 3.362, + "theoretical_loss": 3.984421357476152, + "tokens_seen": 425528320 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004399197592778335, + "loss": 3.2398, + "theoretical_loss": 3.9843534228139017, + "tokens_seen": 425593856 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004399097291875627, + "loss": 3.0803, + "theoretical_loss": 3.984285501540498, + "tokens_seen": 425659392 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004398996990972919, + "loss": 3.1963, + "theoretical_loss": 3.9842175936512416, + "tokens_seen": 425724928 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004398896690070211, + "loss": 3.0382, + "theoretical_loss": 3.9841496991414354, + "tokens_seen": 425790464 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043987963891675024, + "loss": 2.9566, + "theoretical_loss": 3.984081818006384, + "tokens_seen": 425856000 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004398696088264795, + "loss": 3.0157, + "theoretical_loss": 3.9840139502413967, + "tokens_seen": 425921536 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 703399, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3628578186035156, + "objective/train/theoretical_loss": 3.9839630581889383, + "objective/train/tokens_used": 446430688, + "theoretical_loss": 3.9839630581889383, + "tokens_seen": 425970688 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004398595787362086, + "loss": 3.0458, + "theoretical_loss": 3.983946095841782, + "tokens_seen": 425987072 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043984954864593784, + "loss": 3.1956, + "theoretical_loss": 3.9838782548028524, + "tokens_seen": 426052608 + }, + { + "epoch": 1.03, + "learning_rate": 0.000439839518555667, + "loss": 3.1856, + "theoretical_loss": 3.983810427119923, + "tokens_seen": 426118144 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004398294884653962, + "loss": 3.1849, + "theoretical_loss": 3.983742612788311, + "tokens_seen": 426183680 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004398194583751254, + "loss": 3.321, + "theoretical_loss": 3.983674811803335, + "tokens_seen": 426249216 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004398094282848546, + "loss": 3.0916, + "theoretical_loss": 3.9836070241603174, + "tokens_seen": 426314752 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043979939819458375, + "loss": 3.0679, + "theoretical_loss": 3.9835392498545827, + "tokens_seen": 426380288 + }, + { + "epoch": 1.03, + "learning_rate": 0.000439789368104313, + "loss": 3.055, + "theoretical_loss": 3.983471488881456, + "tokens_seen": 426445824 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004397793380140421, + "loss": 3.1936, + "theoretical_loss": 3.983403741236268, + "tokens_seen": 426511360 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043976930792377134, + "loss": 3.2728, + "theoretical_loss": 3.9833360069143486, + "tokens_seen": 426576896 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004397592778335005, + "loss": 3.2623, + "theoretical_loss": 3.983268285911032, + "tokens_seen": 426642432 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004397492477432297, + "loss": 3.1833, + "theoretical_loss": 3.9832005782216537, + "tokens_seen": 426707968 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004397392176529589, + "loss": 3.0795, + "theoretical_loss": 3.9831328838415523, + "tokens_seen": 426773504 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043972918756268807, + "loss": 3.1716, + "theoretical_loss": 3.9830652027660682, + "tokens_seen": 426839040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043971915747241725, + "loss": 3.0024, + "theoretical_loss": 3.9829975349905444, + "tokens_seen": 426904576 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004397091273821465, + "loss": 3.0891, + "theoretical_loss": 3.9829298805103264, + "tokens_seen": 426970112 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004396990972918756, + "loss": 3.0684, + "theoretical_loss": 3.9828622393207613, + "tokens_seen": 427035648 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043968906720160485, + "loss": 3.1981, + "theoretical_loss": 3.9827946114171997, + "tokens_seen": 427101184 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043967903711133403, + "loss": 3.0749, + "theoretical_loss": 3.9827269967949936, + "tokens_seen": 427166720 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004396690070210632, + "loss": 2.9922, + "theoretical_loss": 3.982659395449497, + "tokens_seen": 427232256 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004396589769307924, + "loss": 3.1143, + "theoretical_loss": 3.982591807376069, + "tokens_seen": 427297792 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043964894684052157, + "loss": 3.0944, + "theoretical_loss": 3.9825242325700665, + "tokens_seen": 427363328 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043963891675025075, + "loss": 3.0587, + "theoretical_loss": 3.9824566710268527, + "tokens_seen": 427428864 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043962888665998, + "loss": 3.0447, + "theoretical_loss": 3.982389122741791, + "tokens_seen": 427494400 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004396188565697091, + "loss": 3.0866, + "theoretical_loss": 3.9823215877102482, + "tokens_seen": 427559936 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 706426, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.177389144897461, + "objective/train/theoretical_loss": 3.9822709451314258, + "objective/train/tokens_used": 448069088, + "theoretical_loss": 3.9822709451314258, + "tokens_seen": 427609088 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043960882647943835, + "loss": 3.2174, + "theoretical_loss": 3.9822540659275916, + "tokens_seen": 427625472 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004395987963891675, + "loss": 3.1795, + "theoretical_loss": 3.982186557389194, + "tokens_seen": 427691008 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004395887662988967, + "loss": 3.2016, + "theoretical_loss": 3.982119062090428, + "tokens_seen": 427756544 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004395787362086259, + "loss": 2.9589, + "theoretical_loss": 3.982051580026669, + "tokens_seen": 427822080 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004395687061183551, + "loss": 3.0058, + "theoretical_loss": 3.981984111193295, + "tokens_seen": 427887616 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043955867602808426, + "loss": 3.0916, + "theoretical_loss": 3.981916655585687, + "tokens_seen": 427953152 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043954864593781344, + "loss": 3.2664, + "theoretical_loss": 3.981849213199227, + "tokens_seen": 428018688 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004395386158475426, + "loss": 3.1716, + "theoretical_loss": 3.9817817840293, + "tokens_seen": 428084224 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043952858575727185, + "loss": 3.025, + "theoretical_loss": 3.9817143680712928, + "tokens_seen": 428149760 + }, + { + "epoch": 1.03, + "learning_rate": 0.000439518555667001, + "loss": 3.1716, + "theoretical_loss": 3.9816469653205955, + "tokens_seen": 428215296 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004395085255767302, + "loss": 3.0316, + "theoretical_loss": 3.9815795757725994, + "tokens_seen": 428280832 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004394984954864594, + "loss": 3.1629, + "theoretical_loss": 3.9815121994226996, + "tokens_seen": 428346368 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004394884653961886, + "loss": 3.0088, + "theoretical_loss": 3.9814448362662924, + "tokens_seen": 428411904 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043947843530591776, + "loss": 3.0621, + "theoretical_loss": 3.9813774862987756, + "tokens_seen": 428477440 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043946840521564694, + "loss": 3.2224, + "theoretical_loss": 3.9813101495155516, + "tokens_seen": 428542976 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004394583751253761, + "loss": 3.1106, + "theoretical_loss": 3.981242825912023, + "tokens_seen": 428608512 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043944834503510536, + "loss": 3.1592, + "theoretical_loss": 3.981175515483596, + "tokens_seen": 428674048 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004394383149448345, + "loss": 3.2254, + "theoretical_loss": 3.9811082182256783, + "tokens_seen": 428739584 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004394282848545637, + "loss": 3.3119, + "theoretical_loss": 3.9810409341336808, + "tokens_seen": 428805120 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043941825476429285, + "loss": 3.0989, + "theoretical_loss": 3.9809736632030153, + "tokens_seen": 428870656 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004394082246740221, + "loss": 3.1474, + "theoretical_loss": 3.9809064054290975, + "tokens_seen": 428936192 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043939819458375126, + "loss": 3.2748, + "theoretical_loss": 3.980839160807344, + "tokens_seen": 429001728 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043938816449348044, + "loss": 3.0667, + "theoretical_loss": 3.9807719293331743, + "tokens_seen": 429067264 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004393781344032096, + "loss": 3.2634, + "theoretical_loss": 3.9807047110020104, + "tokens_seen": 429132800 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004393681043129388, + "loss": 3.1843, + "theoretical_loss": 3.980637505809277, + "tokens_seen": 429198336 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 707874, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.165225028991699, + "objective/train/theoretical_loss": 3.9805871105340698, + "objective/train/tokens_used": 449707488, + "theoretical_loss": 3.9805871105340698, + "tokens_seen": 429247488 + }, + { + "epoch": 1.03, + "learning_rate": 0.000439358074222668, + "loss": 3.1692, + "theoretical_loss": 3.9805703137503996, + "tokens_seen": 429263872 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004393480441323972, + "loss": 3.0977, + "theoretical_loss": 3.9805031348208075, + "tokens_seen": 429329408 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043933801404212635, + "loss": 2.985, + "theoretical_loss": 3.9804359690159314, + "tokens_seen": 429394944 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004393279839518556, + "loss": 2.9109, + "theoretical_loss": 3.9803688163312048, + "tokens_seen": 429460480 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043931795386158477, + "loss": 3.1, + "theoretical_loss": 3.980301676762063, + "tokens_seen": 429526016 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043930792377131395, + "loss": 3.0829, + "theoretical_loss": 3.9802345503039445, + "tokens_seen": 429591552 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043929789368104313, + "loss": 3.1845, + "theoretical_loss": 3.9801674369522884, + "tokens_seen": 429657088 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004392878635907723, + "loss": 3.0632, + "theoretical_loss": 3.980100336702537, + "tokens_seen": 429722624 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004392778335005015, + "loss": 3.1341, + "theoretical_loss": 3.9800332495501367, + "tokens_seen": 429788160 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004392678034102307, + "loss": 3.2055, + "theoretical_loss": 3.979966175490533, + "tokens_seen": 429853696 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043925777331995985, + "loss": 3.1357, + "theoretical_loss": 3.979899114519175, + "tokens_seen": 429919232 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004392477432296891, + "loss": 3.2381, + "theoretical_loss": 3.9798320666315146, + "tokens_seen": 429984768 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004392377131394182, + "loss": 3.3833, + "theoretical_loss": 3.979765031823006, + "tokens_seen": 430050304 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043922768304914745, + "loss": 3.1487, + "theoretical_loss": 3.979698010089105, + "tokens_seen": 430115840 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043921765295887663, + "loss": 3.2069, + "theoretical_loss": 3.97963100142527, + "tokens_seen": 430181376 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004392076228686058, + "loss": 3.0806, + "theoretical_loss": 3.979564005826961, + "tokens_seen": 430246912 + }, + { + "epoch": 1.03, + "learning_rate": 0.000439197592778335, + "loss": 3.0815, + "theoretical_loss": 3.9794970232896416, + "tokens_seen": 430312448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043918756268806423, + "loss": 3.0444, + "theoretical_loss": 3.979430053808777, + "tokens_seen": 430377984 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004391775325977934, + "loss": 2.9991, + "theoretical_loss": 3.9793630973798333, + "tokens_seen": 430443520 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004391675025075226, + "loss": 3.1527, + "theoretical_loss": 3.979296153998282, + "tokens_seen": 430509056 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043915747241725177, + "loss": 3.2593, + "theoretical_loss": 3.979229223659593, + "tokens_seen": 430574592 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043914744232698095, + "loss": 3.1108, + "theoretical_loss": 3.9791623063592425, + "tokens_seen": 430640128 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004391374122367102, + "loss": 3.045, + "theoretical_loss": 3.979095402092706, + "tokens_seen": 430705664 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004391273821464393, + "loss": 3.2107, + "theoretical_loss": 3.9790285108554624, + "tokens_seen": 430771200 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043911735205616855, + "loss": 3.2452, + "theoretical_loss": 3.978961632642992, + "tokens_seen": 430836736 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 710735, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.428483724594116, + "objective/train/theoretical_loss": 3.9789114825284297, + "objective/train/tokens_used": 451345888, + "theoretical_loss": 3.9789114825284297, + "tokens_seen": 430885888 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004391073219658977, + "loss": 3.2874, + "theoretical_loss": 3.978894767450779, + "tokens_seen": 430902272 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004390972918756269, + "loss": 3.1046, + "theoretical_loss": 3.9788279152743082, + "tokens_seen": 430967808 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004390872617853561, + "loss": 3.0451, + "theoretical_loss": 3.978761076109067, + "tokens_seen": 431033344 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004390772316950853, + "loss": 3.1426, + "theoretical_loss": 3.9786942499505464, + "tokens_seen": 431098880 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043906720160481446, + "loss": 3.2194, + "theoretical_loss": 3.978627436794238, + "tokens_seen": 431164416 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043905717151454364, + "loss": 3.0053, + "theoretical_loss": 3.9785606366356365, + "tokens_seen": 431229952 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004390471414242728, + "loss": 3.0812, + "theoretical_loss": 3.978493849470238, + "tokens_seen": 431295488 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043903711133400205, + "loss": 3.0093, + "theoretical_loss": 3.978427075293542, + "tokens_seen": 431361024 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004390270812437312, + "loss": 2.983, + "theoretical_loss": 3.9783603141010495, + "tokens_seen": 431426560 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004390170511534604, + "loss": 3.0844, + "theoretical_loss": 3.978293565888264, + "tokens_seen": 431492096 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004390070210631896, + "loss": 3.2232, + "theoretical_loss": 3.9782268306506916, + "tokens_seen": 431557632 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004389969909729188, + "loss": 3.086, + "theoretical_loss": 3.9781601083838396, + "tokens_seen": 431623168 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043898696088264796, + "loss": 3.0661, + "theoretical_loss": 3.9780933990832184, + "tokens_seen": 431688704 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043897693079237714, + "loss": 3.255, + "theoretical_loss": 3.97802670274434, + "tokens_seen": 431754240 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004389669007021063, + "loss": 3.2041, + "theoretical_loss": 3.97796001936272, + "tokens_seen": 431819776 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043895687061183556, + "loss": 3.1765, + "theoretical_loss": 3.9778933489338737, + "tokens_seen": 431885312 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004389468405215647, + "loss": 3.2672, + "theoretical_loss": 3.9778266914533216, + "tokens_seen": 431950848 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004389368104312939, + "loss": 3.2317, + "theoretical_loss": 3.9777600469165844, + "tokens_seen": 432016384 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043892678034102305, + "loss": 3.1684, + "theoretical_loss": 3.9776934153191856, + "tokens_seen": 432081920 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004389167502507523, + "loss": 3.2044, + "theoretical_loss": 3.977626796656651, + "tokens_seen": 432147456 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043890672016048146, + "loss": 3.2265, + "theoretical_loss": 3.977560190924509, + "tokens_seen": 432212992 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043889669007021064, + "loss": 3.0488, + "theoretical_loss": 3.9774935981182895, + "tokens_seen": 432278528 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004388866599799398, + "loss": 3.2885, + "theoretical_loss": 3.977427018233525, + "tokens_seen": 432344064 + }, + { + "epoch": 1.03, + "learning_rate": 0.000438876629889669, + "loss": 3.0455, + "theoretical_loss": 3.97736045126575, + "tokens_seen": 432409600 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004388665997993982, + "loss": 3.0116, + "theoretical_loss": 3.977293897210501, + "tokens_seen": 432475136 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 713758, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9373741149902344, + "objective/train/theoretical_loss": 3.977243990140227, + "objective/train/tokens_used": 452984288, + "theoretical_loss": 3.977243990140227, + "tokens_seen": 432524288 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004388565697091274, + "loss": 3.1311, + "theoretical_loss": 3.9772273560633185, + "tokens_seen": 432540672 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043884653961885655, + "loss": 3.0376, + "theoretical_loss": 3.977160827819742, + "tokens_seen": 432606208 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004388365095285858, + "loss": 3.041, + "theoretical_loss": 3.977094312475317, + "tokens_seen": 432671744 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043882647943831497, + "loss": 3.1525, + "theoretical_loss": 3.9770278100255867, + "tokens_seen": 432737280 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043881644934804415, + "loss": 3.1463, + "theoretical_loss": 3.976961320466102, + "tokens_seen": 432802816 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043880641925777333, + "loss": 3.1172, + "theoretical_loss": 3.9768948437924108, + "tokens_seen": 432868352 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004387963891675025, + "loss": 3.0196, + "theoretical_loss": 3.9768283800000663, + "tokens_seen": 432933888 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004387863590772317, + "loss": 3.0698, + "theoretical_loss": 3.976761929084623, + "tokens_seen": 432999424 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004387763289869609, + "loss": 3.0758, + "theoretical_loss": 3.976695491041638, + "tokens_seen": 433064960 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043876629889669005, + "loss": 3.0381, + "theoretical_loss": 3.97662906586667, + "tokens_seen": 433130496 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004387562688064193, + "loss": 3.1778, + "theoretical_loss": 3.97656265355528, + "tokens_seen": 433196032 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004387462387161484, + "loss": 3.0284, + "theoretical_loss": 3.976496254103032, + "tokens_seen": 433261568 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043873620862587765, + "loss": 3.0946, + "theoretical_loss": 3.976429867505491, + "tokens_seen": 433327104 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043872617853560683, + "loss": 3.103, + "theoretical_loss": 3.9763634937582246, + "tokens_seen": 433392640 + }, + { + "epoch": 1.03, + "learning_rate": 0.000438716148445336, + "loss": 3.3019, + "theoretical_loss": 3.976297132856804, + "tokens_seen": 433458176 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004387061183550652, + "loss": 2.9494, + "theoretical_loss": 3.9762307847968, + "tokens_seen": 433523712 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043869608826479443, + "loss": 2.9237, + "theoretical_loss": 3.976164449573788, + "tokens_seen": 433589248 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043868605817452356, + "loss": 3.0651, + "theoretical_loss": 3.976098127183344, + "tokens_seen": 433654784 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004386760280842528, + "loss": 3.2603, + "theoretical_loss": 3.9760318176210476, + "tokens_seen": 433720320 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004386659979939819, + "loss": 3.1503, + "theoretical_loss": 3.975965520882479, + "tokens_seen": 433785856 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043865596790371115, + "loss": 3.0641, + "theoretical_loss": 3.9758992369632207, + "tokens_seen": 433851392 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043864593781344033, + "loss": 3.3106, + "theoretical_loss": 3.97583296585886, + "tokens_seen": 433916928 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004386359077231695, + "loss": 3.1191, + "theoretical_loss": 3.9757667075649827, + "tokens_seen": 433982464 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004386258776328987, + "loss": 3.1559, + "theoretical_loss": 3.975700462077179, + "tokens_seen": 434048000 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004386158475426279, + "loss": 3.0587, + "theoretical_loss": 3.975634229391041, + "tokens_seen": 434113536 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 716642, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9459481239318848, + "objective/train/theoretical_loss": 3.9755845632748805, + "objective/train/tokens_used": 454622688, + "theoretical_loss": 3.9755845632748805, + "tokens_seen": 434162688 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043860581745235706, + "loss": 3.0542, + "theoretical_loss": 3.9755680095021635, + "tokens_seen": 434179072 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004385957873620863, + "loss": 3.1689, + "theoretical_loss": 3.975501802406141, + "tokens_seen": 434244608 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004385857572718154, + "loss": 3.1914, + "theoretical_loss": 3.9754356080985733, + "tokens_seen": 434310144 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043857572718154466, + "loss": 3.1199, + "theoretical_loss": 3.9753694265750603, + "tokens_seen": 434375680 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004385656970912738, + "loss": 3.2403, + "theoretical_loss": 3.9753032578312055, + "tokens_seen": 434441216 + }, + { + "epoch": 1.03, + "learning_rate": 0.000438555667001003, + "loss": 3.0913, + "theoretical_loss": 3.975237101862614, + "tokens_seen": 434506752 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004385456369107322, + "loss": 3.2943, + "theoretical_loss": 3.975170958664892, + "tokens_seen": 434572288 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004385356068204614, + "loss": 3.2356, + "theoretical_loss": 3.9751048282336496, + "tokens_seen": 434637824 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043852557673019056, + "loss": 3.1442, + "theoretical_loss": 3.9750387105644975, + "tokens_seen": 434703360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004385155466399198, + "loss": 3.2731, + "theoretical_loss": 3.9749726056530506, + "tokens_seen": 434768896 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004385055165496489, + "loss": 3.1072, + "theoretical_loss": 3.9749065134949233, + "tokens_seen": 434834432 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043849548645937816, + "loss": 3.0025, + "theoretical_loss": 3.974840434085735, + "tokens_seen": 434899968 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004384854563691073, + "loss": 3.2481, + "theoretical_loss": 3.9747743674211042, + "tokens_seen": 434965504 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004384754262788365, + "loss": 3.1779, + "theoretical_loss": 3.974708313496655, + "tokens_seen": 435031040 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004384653961885657, + "loss": 3.049, + "theoretical_loss": 3.974642272308011, + "tokens_seen": 435096576 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004384553660982949, + "loss": 3.0178, + "theoretical_loss": 3.974576243850799, + "tokens_seen": 435162112 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043844533600802407, + "loss": 3.079, + "theoretical_loss": 3.9745102281206477, + "tokens_seen": 435227648 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043843530591775325, + "loss": 3.3066, + "theoretical_loss": 3.9744442251131877, + "tokens_seen": 435293184 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004384252758274825, + "loss": 3.1664, + "theoretical_loss": 3.9743782348240533, + "tokens_seen": 435358720 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043841524573721166, + "loss": 3.1793, + "theoretical_loss": 3.9743122572488785, + "tokens_seen": 435424256 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043840521564694084, + "loss": 3.0566, + "theoretical_loss": 3.9742462923833015, + "tokens_seen": 435489792 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043839518555667, + "loss": 3.2643, + "theoretical_loss": 3.9741803402229623, + "tokens_seen": 435555328 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004383851554663992, + "loss": 3.0476, + "theoretical_loss": 3.9741144007635016, + "tokens_seen": 435620864 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004383751253761284, + "loss": 3.0621, + "theoretical_loss": 3.974048474000564, + "tokens_seen": 435686400 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004383650952858576, + "loss": 3.1195, + "theoretical_loss": 3.9739825599297944, + "tokens_seen": 435751936 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 718634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0812394618988037, + "objective/train/theoretical_loss": 3.973933132703336, + "objective/train/tokens_used": 456261088, + "theoretical_loss": 3.973933132703336, + "tokens_seen": 435801088 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043835506519558675, + "loss": 3.0478, + "theoretical_loss": 3.973916658546843, + "tokens_seen": 435817472 + }, + { + "epoch": 1.03, + "learning_rate": 0.000438345035105316, + "loss": 3.1574, + "theoretical_loss": 3.9738507698473584, + "tokens_seen": 435883008 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043833500501504517, + "loss": 3.1449, + "theoretical_loss": 3.973784893826994, + "tokens_seen": 435948544 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043832497492477435, + "loss": 3.1732, + "theoretical_loss": 3.973719030481404, + "tokens_seen": 436014080 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043831494483450353, + "loss": 3.0495, + "theoretical_loss": 3.9736531798062456, + "tokens_seen": 436079616 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004383049147442327, + "loss": 3.0259, + "theoretical_loss": 3.9735873417971774, + "tokens_seen": 436145152 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004382948846539619, + "loss": 3.1395, + "theoretical_loss": 3.97352151644986, + "tokens_seen": 436210688 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004382848545636911, + "loss": 3.1631, + "theoretical_loss": 3.9734557037599574, + "tokens_seen": 436276224 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043827482447342025, + "loss": 3.037, + "theoretical_loss": 3.9733899037231346, + "tokens_seen": 436341760 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004382647943831495, + "loss": 3.0361, + "theoretical_loss": 3.9733241163350597, + "tokens_seen": 436407296 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004382547642928786, + "loss": 3.1084, + "theoretical_loss": 3.973258341591401, + "tokens_seen": 436472832 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043824473420260785, + "loss": 3.0336, + "theoretical_loss": 3.9731925794878307, + "tokens_seen": 436538368 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043823470411233703, + "loss": 3.1471, + "theoretical_loss": 3.973126830020023, + "tokens_seen": 436603904 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004382246740220662, + "loss": 3.0854, + "theoretical_loss": 3.9730610931836536, + "tokens_seen": 436669440 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004382146439317954, + "loss": 3.1501, + "theoretical_loss": 3.9729953689744013, + "tokens_seen": 436734976 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043820461384152463, + "loss": 3.3095, + "theoretical_loss": 3.972929657387945, + "tokens_seen": 436800512 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043819458375125376, + "loss": 3.292, + "theoretical_loss": 3.9728639584199685, + "tokens_seen": 436866048 + }, + { + "epoch": 1.03, + "learning_rate": 0.000438184553660983, + "loss": 3.1544, + "theoretical_loss": 3.9727982720661554, + "tokens_seen": 436931584 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004381745235707121, + "loss": 3.1356, + "theoretical_loss": 3.972732598322193, + "tokens_seen": 436997120 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043816449348044135, + "loss": 3.2776, + "theoretical_loss": 3.972666937183769, + "tokens_seen": 437062656 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043815446339017053, + "loss": 3.146, + "theoretical_loss": 3.972601288646575, + "tokens_seen": 437128192 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004381444332998997, + "loss": 3.314, + "theoretical_loss": 3.9725356527063043, + "tokens_seen": 437193728 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004381344032096289, + "loss": 3.049, + "theoretical_loss": 3.972470029358651, + "tokens_seen": 437259264 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004381243731193581, + "loss": 3.2304, + "theoretical_loss": 3.972404418599313, + "tokens_seen": 437324800 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043811434302908726, + "loss": 3.2305, + "theoretical_loss": 3.97233882042399, + "tokens_seen": 437390336 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 721367, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.284128427505493, + "objective/train/theoretical_loss": 3.9722896300481705, + "objective/train/tokens_used": 457899488, + "theoretical_loss": 3.9722896300481705, + "tokens_seen": 437439488 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004381043129388165, + "loss": 3.1586, + "theoretical_loss": 3.972273234828382, + "tokens_seen": 437455872 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004380942828485456, + "loss": 3.0727, + "theoretical_loss": 3.972207661808194, + "tokens_seen": 437521408 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043808425275827486, + "loss": 3.2524, + "theoretical_loss": 3.9721421013591307, + "tokens_seen": 437586944 + }, + { + "epoch": 1.03, + "learning_rate": 0.000438074222668004, + "loss": 3.1346, + "theoretical_loss": 3.9720765534769007, + "tokens_seen": 437652480 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004380641925777332, + "loss": 3.145, + "theoretical_loss": 3.9720110181572132, + "tokens_seen": 437718016 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004380541624874624, + "loss": 3.0142, + "theoretical_loss": 3.971945495395781, + "tokens_seen": 437783552 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004380441323971916, + "loss": 3.1251, + "theoretical_loss": 3.971879985188317, + "tokens_seen": 437849088 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043803410230692076, + "loss": 3.1286, + "theoretical_loss": 3.971814487530538, + "tokens_seen": 437914624 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043802407221665, + "loss": 3.1475, + "theoretical_loss": 3.9717490024181625, + "tokens_seen": 437980160 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004380140421263791, + "loss": 3.2249, + "theoretical_loss": 3.971683529846911, + "tokens_seen": 438045696 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043800401203610836, + "loss": 3.2395, + "theoretical_loss": 3.971618069812506, + "tokens_seen": 438111232 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004379939819458375, + "loss": 3.111, + "theoretical_loss": 3.9715526223106714, + "tokens_seen": 438176768 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004379839518555667, + "loss": 3.2616, + "theoretical_loss": 3.971487187337134, + "tokens_seen": 438242304 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004379739217652959, + "loss": 3.1456, + "theoretical_loss": 3.9714217648876238, + "tokens_seen": 438307840 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004379638916750251, + "loss": 3.2372, + "theoretical_loss": 3.97135635495787, + "tokens_seen": 438373376 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043795386158475427, + "loss": 3.1901, + "theoretical_loss": 3.9712909575436064, + "tokens_seen": 438438912 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043794383149448345, + "loss": 3.1276, + "theoretical_loss": 3.9712255726405683, + "tokens_seen": 438504448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043793380140421263, + "loss": 3.135, + "theoretical_loss": 3.9711602002444923, + "tokens_seen": 438569984 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043792377131394186, + "loss": 2.9493, + "theoretical_loss": 3.9710948403511184, + "tokens_seen": 438635520 + }, + { + "epoch": 1.03, + "learning_rate": 0.000437913741223671, + "loss": 3.1212, + "theoretical_loss": 3.9710294929561876, + "tokens_seen": 438701056 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004379037111334002, + "loss": 3.0782, + "theoretical_loss": 3.970964158055443, + "tokens_seen": 438766592 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043789368104312935, + "loss": 3.1428, + "theoretical_loss": 3.97089883564463, + "tokens_seen": 438832128 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004378836509528586, + "loss": 3.039, + "theoretical_loss": 3.9708335257194967, + "tokens_seen": 438897664 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043787362086258777, + "loss": 3.0138, + "theoretical_loss": 3.9707682282757926, + "tokens_seen": 438963200 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043786359077231695, + "loss": 3.1478, + "theoretical_loss": 3.9707029433092695, + "tokens_seen": 439028736 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 724037, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.29388427734375, + "objective/train/theoretical_loss": 3.970653987769973, + "objective/train/tokens_used": 459537888, + "theoretical_loss": 3.970653987769973, + "tokens_seen": 439077888 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043785356068204613, + "loss": 3.1064, + "theoretical_loss": 3.970637670815681, + "tokens_seen": 439094272 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043784353059177537, + "loss": 3.2294, + "theoretical_loss": 3.9705724107907834, + "tokens_seen": 439159808 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004378335005015045, + "loss": 3.0611, + "theoretical_loss": 3.9705071632303346, + "tokens_seen": 439225344 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043782347041123373, + "loss": 3.0616, + "theoretical_loss": 3.970441928130094, + "tokens_seen": 439290880 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043781344032096286, + "loss": 3.0596, + "theoretical_loss": 3.9703767054858248, + "tokens_seen": 439356416 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004378034102306921, + "loss": 3.1325, + "theoretical_loss": 3.9703114952932905, + "tokens_seen": 439421952 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043779338014042127, + "loss": 3.2289, + "theoretical_loss": 3.9702462975482575, + "tokens_seen": 439487488 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043778335005015045, + "loss": 2.9496, + "theoretical_loss": 3.9701811122464945, + "tokens_seen": 439553024 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043777331995987963, + "loss": 3.0722, + "theoretical_loss": 3.9701159393837715, + "tokens_seen": 439618560 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004377632898696088, + "loss": 2.963, + "theoretical_loss": 3.9700507789558612, + "tokens_seen": 439684096 + }, + { + "epoch": 1.03, + "learning_rate": 0.000437753259779338, + "loss": 3.0227, + "theoretical_loss": 3.969985630958538, + "tokens_seen": 439749632 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043774322968906723, + "loss": 3.2489, + "theoretical_loss": 3.969920495387579, + "tokens_seen": 439815168 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043773319959879636, + "loss": 2.9723, + "theoretical_loss": 3.9698553722387624, + "tokens_seen": 439880704 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004377231695085256, + "loss": 3.1833, + "theoretical_loss": 3.969790261507869, + "tokens_seen": 439946240 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004377131394182547, + "loss": 3.1245, + "theoretical_loss": 3.969725163190682, + "tokens_seen": 440011776 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043770310932798396, + "loss": 3.1032, + "theoretical_loss": 3.9696600772829855, + "tokens_seen": 440077312 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043769307923771314, + "loss": 3.0728, + "theoretical_loss": 3.9695950037805674, + "tokens_seen": 440142848 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004376830491474423, + "loss": 3.009, + "theoretical_loss": 3.969529942679216, + "tokens_seen": 440208384 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043767301905717155, + "loss": 3.0969, + "theoretical_loss": 3.9694648939747226, + "tokens_seen": 440273920 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043766298896690073, + "loss": 3.2049, + "theoretical_loss": 3.9693998576628795, + "tokens_seen": 440339456 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004376529588766299, + "loss": 3.0676, + "theoretical_loss": 3.9693348337394836, + "tokens_seen": 440404992 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004376429287863591, + "loss": 3.2413, + "theoretical_loss": 3.9692698222003306, + "tokens_seen": 440470528 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004376328986960883, + "loss": 3.1218, + "theoretical_loss": 3.969204823041221, + "tokens_seen": 440536064 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043762286860581746, + "loss": 3.0499, + "theoretical_loss": 3.9691398362579546, + "tokens_seen": 440601600 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004376128385155467, + "loss": 3.2435, + "theoretical_loss": 3.9690748618463356, + "tokens_seen": 440667136 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 726896, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1499991416931152, + "objective/train/theoretical_loss": 3.969026139153992, + "objective/train/tokens_used": 461176288, + "theoretical_loss": 3.969026139153992, + "tokens_seen": 440716288 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004376028084252758, + "loss": 2.9195, + "theoretical_loss": 3.9690098998021694, + "tokens_seen": 440732672 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043759277833500506, + "loss": 3.1561, + "theoretical_loss": 3.968944950121264, + "tokens_seen": 440798208 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004375827482447342, + "loss": 3.2465, + "theoretical_loss": 3.9688800127994277, + "tokens_seen": 440863744 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004375727181544634, + "loss": 3.2049, + "theoretical_loss": 3.968815087832473, + "tokens_seen": 440929280 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004375626880641926, + "loss": 3.098, + "theoretical_loss": 3.968750175216213, + "tokens_seen": 440994816 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004375526579739218, + "loss": 2.9673, + "theoretical_loss": 3.9686852749464627, + "tokens_seen": 441060352 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043754262788365096, + "loss": 2.932, + "theoretical_loss": 3.9686203870190413, + "tokens_seen": 441125888 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004375325977933802, + "loss": 3.2043, + "theoretical_loss": 3.9685555114297673, + "tokens_seen": 441191424 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004375225677031093, + "loss": 3.1332, + "theoretical_loss": 3.968490648174463, + "tokens_seen": 441256960 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043751253761283856, + "loss": 3.1691, + "theoretical_loss": 3.968425797248952, + "tokens_seen": 441322496 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004375025075225677, + "loss": 2.9748, + "theoretical_loss": 3.96836095864906, + "tokens_seen": 441388032 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004374924774322969, + "loss": 3.174, + "theoretical_loss": 3.9682961323706145, + "tokens_seen": 441453568 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004374824473420261, + "loss": 3.1999, + "theoretical_loss": 3.968231318409446, + "tokens_seen": 441519104 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004374724172517553, + "loss": 3.1849, + "theoretical_loss": 3.968166516761386, + "tokens_seen": 441584640 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043746238716148447, + "loss": 3.0837, + "theoretical_loss": 3.968101727422269, + "tokens_seen": 441650176 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043745235707121365, + "loss": 3.0493, + "theoretical_loss": 3.9680369503879303, + "tokens_seen": 441715712 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043744232698094283, + "loss": 3.0785, + "theoretical_loss": 3.967972185654208, + "tokens_seen": 441781248 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043743229689067206, + "loss": 3.1332, + "theoretical_loss": 3.967907433216942, + "tokens_seen": 441846784 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004374222668004012, + "loss": 3.0856, + "theoretical_loss": 3.967842693071975, + "tokens_seen": 441912320 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004374122367101304, + "loss": 3.1272, + "theoretical_loss": 3.9677779652151504, + "tokens_seen": 441977856 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043740220661985955, + "loss": 3.1817, + "theoretical_loss": 3.9677132496423146, + "tokens_seen": 442043392 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004373921765295888, + "loss": 3.1118, + "theoretical_loss": 3.967648546349315, + "tokens_seen": 442108928 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043738214643931797, + "loss": 3.2312, + "theoretical_loss": 3.967583855332003, + "tokens_seen": 442174464 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043737211634904715, + "loss": 3.2529, + "theoretical_loss": 3.9675191765862294, + "tokens_seen": 442240000 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043736208625877633, + "loss": 3.1077, + "theoretical_loss": 3.967454510107849, + "tokens_seen": 442305536 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 729712, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1680171489715576, + "objective/train/theoretical_loss": 3.967406018297048, + "objective/train/tokens_used": 462814688, + "theoretical_loss": 3.967406018297048, + "tokens_seen": 442354688 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043735205616850557, + "loss": 3.1412, + "theoretical_loss": 3.9673898558927183, + "tokens_seen": 442371072 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004373420260782347, + "loss": 3.1169, + "theoretical_loss": 3.967325213936695, + "tokens_seen": 442436608 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043733199598796393, + "loss": 3.1741, + "theoretical_loss": 3.9672605842356385, + "tokens_seen": 442502144 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043732196589769306, + "loss": 3.188, + "theoretical_loss": 3.9671959667854124, + "tokens_seen": 442567680 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004373119358074223, + "loss": 3.2639, + "theoretical_loss": 3.96713136158188, + "tokens_seen": 442633216 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043730190571715147, + "loss": 3.1499, + "theoretical_loss": 3.967066768620908, + "tokens_seen": 442698752 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043729187562688065, + "loss": 3.0619, + "theoretical_loss": 3.9670021878983643, + "tokens_seen": 442764288 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043728184553660983, + "loss": 3.0824, + "theoretical_loss": 3.9669376194101194, + "tokens_seen": 442829824 + }, + { + "epoch": 1.03, + "learning_rate": 0.000437271815446339, + "loss": 3.2252, + "theoretical_loss": 3.9668730631520455, + "tokens_seen": 442895360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004372617853560682, + "loss": 3.1495, + "theoretical_loss": 3.9668085191200166, + "tokens_seen": 442960896 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043725175526579743, + "loss": 2.9111, + "theoretical_loss": 3.9667439873099086, + "tokens_seen": 443026432 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043724172517552656, + "loss": 3.0373, + "theoretical_loss": 3.9666794677176007, + "tokens_seen": 443091968 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004372316950852558, + "loss": 2.9709, + "theoretical_loss": 3.9666149603389727, + "tokens_seen": 443157504 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004372216649949849, + "loss": 2.8703, + "theoretical_loss": 3.966550465169906, + "tokens_seen": 443223040 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043721163490471416, + "loss": 3.0921, + "theoretical_loss": 3.9664859822062866, + "tokens_seen": 443288576 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043720160481444334, + "loss": 3.1214, + "theoretical_loss": 3.9664215114439987, + "tokens_seen": 443354112 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004371915747241725, + "loss": 3.2286, + "theoretical_loss": 3.966357052878932, + "tokens_seen": 443419648 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004371815446339017, + "loss": 3.1536, + "theoretical_loss": 3.9662926065069763, + "tokens_seen": 443485184 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043717151454363093, + "loss": 3.1096, + "theoretical_loss": 3.9662281723240236, + "tokens_seen": 443550720 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043716148445336006, + "loss": 3.1053, + "theoretical_loss": 3.966163750325968, + "tokens_seen": 443616256 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004371514543630893, + "loss": 3.3097, + "theoretical_loss": 3.966099340508706, + "tokens_seen": 443681792 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004371414242728184, + "loss": 3.1333, + "theoretical_loss": 3.9660349428681356, + "tokens_seen": 443747328 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043713139418254766, + "loss": 2.9667, + "theoretical_loss": 3.9659705574001567, + "tokens_seen": 443812864 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043712136409227684, + "loss": 3.2184, + "theoretical_loss": 3.9659061841006724, + "tokens_seen": 443878400 + }, + { + "epoch": 1.03, + "learning_rate": 0.000437111334002006, + "loss": 3.2269, + "theoretical_loss": 3.9658418229655856, + "tokens_seen": 443943936 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 731084, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.936250925064087, + "objective/train/theoretical_loss": 3.9657935600946943, + "objective/train/tokens_used": 464453088, + "theoretical_loss": 3.9657935600946943, + "tokens_seen": 443993088 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004371013039117352, + "loss": 3.1743, + "theoretical_loss": 3.9657774739908036, + "tokens_seen": 444009472 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004370912738214644, + "loss": 3.0883, + "theoretical_loss": 3.9657131371722336, + "tokens_seen": 444075008 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043708124373119356, + "loss": 3.1886, + "theoretical_loss": 3.965648812505786, + "tokens_seen": 444140544 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004370712136409228, + "loss": 3.0547, + "theoretical_loss": 3.9655844999873726, + "tokens_seen": 444206080 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004370611835506519, + "loss": 3.0452, + "theoretical_loss": 3.965520199612908, + "tokens_seen": 444271616 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043705115346038116, + "loss": 3.1507, + "theoretical_loss": 3.9654559113783074, + "tokens_seen": 444337152 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004370411233701103, + "loss": 2.9937, + "theoretical_loss": 3.9653916352794893, + "tokens_seen": 444402688 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004370310932798395, + "loss": 3.2046, + "theoretical_loss": 3.9653273713123736, + "tokens_seen": 444468224 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004370210631895687, + "loss": 3.0045, + "theoretical_loss": 3.9652631194728825, + "tokens_seen": 444533760 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004370110330992979, + "loss": 3.0764, + "theoretical_loss": 3.965198879756939, + "tokens_seen": 444599296 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043700100300902707, + "loss": 3.0115, + "theoretical_loss": 3.9651346521604696, + "tokens_seen": 444664832 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004369909729187563, + "loss": 2.9916, + "theoretical_loss": 3.9650704366794027, + "tokens_seen": 444730368 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043698094282848543, + "loss": 3.0597, + "theoretical_loss": 3.9650062333096674, + "tokens_seen": 444795904 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043697091273821467, + "loss": 3.16, + "theoretical_loss": 3.9649420420471957, + "tokens_seen": 444861440 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004369608826479438, + "loss": 3.164, + "theoretical_loss": 3.9648778628879207, + "tokens_seen": 444926976 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043695085255767303, + "loss": 3.1326, + "theoretical_loss": 3.964813695827779, + "tokens_seen": 444992512 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004369408224674022, + "loss": 3.1883, + "theoretical_loss": 3.964749540862708, + "tokens_seen": 445058048 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004369307923771314, + "loss": 3.1156, + "theoretical_loss": 3.9646853979886467, + "tokens_seen": 445123584 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004369207622868606, + "loss": 3.2116, + "theoretical_loss": 3.9646212672015375, + "tokens_seen": 445189120 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043691073219658975, + "loss": 3.2974, + "theoretical_loss": 3.964557148497324, + "tokens_seen": 445254656 + }, + { + "epoch": 1.03, + "learning_rate": 0.000436900702106319, + "loss": 3.2234, + "theoretical_loss": 3.964493041871951, + "tokens_seen": 445320192 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043689067201604817, + "loss": 3.2115, + "theoretical_loss": 3.964428947321366, + "tokens_seen": 445385728 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043688064192577735, + "loss": 3.4197, + "theoretical_loss": 3.96436486484152, + "tokens_seen": 445451264 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043687061183550653, + "loss": 3.3332, + "theoretical_loss": 3.964300794428362, + "tokens_seen": 445516800 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043686058174523577, + "loss": 3.0993, + "theoretical_loss": 3.964236736077847, + "tokens_seen": 445582336 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 733955, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3839240074157715, + "objective/train/theoretical_loss": 3.964188700228636, + "objective/train/tokens_used": 466091488, + "theoretical_loss": 3.964188700228636, + "tokens_seen": 445631488 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004368505516549649, + "loss": 3.2042, + "theoretical_loss": 3.9641726897859293, + "tokens_seen": 445647872 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043684052156469413, + "loss": 3.1635, + "theoretical_loss": 3.964108655548567, + "tokens_seen": 445713408 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043683049147442326, + "loss": 3.2453, + "theoretical_loss": 3.964044633361719, + "tokens_seen": 445778944 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004368204613841525, + "loss": 3.1787, + "theoretical_loss": 3.963980623221346, + "tokens_seen": 445844480 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043681043129388167, + "loss": 3.1538, + "theoretical_loss": 3.963916625123412, + "tokens_seen": 445910016 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043680040120361085, + "loss": 3.2779, + "theoretical_loss": 3.96385263906388, + "tokens_seen": 445975552 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043679037111334003, + "loss": 3.1595, + "theoretical_loss": 3.9637886650387197, + "tokens_seen": 446041088 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004367803410230692, + "loss": 3.2292, + "theoretical_loss": 3.963724703043898, + "tokens_seen": 446106624 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004367703109327984, + "loss": 3.2571, + "theoretical_loss": 3.963660753075387, + "tokens_seen": 446172160 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043676028084252763, + "loss": 3.3299, + "theoretical_loss": 3.9635968151291583, + "tokens_seen": 446237696 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043675025075225676, + "loss": 3.2408, + "theoretical_loss": 3.9635328892011876, + "tokens_seen": 446303232 + }, + { + "epoch": 1.03, + "learning_rate": 0.000436740220661986, + "loss": 3.2247, + "theoretical_loss": 3.9634689752874515, + "tokens_seen": 446368768 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004367301905717151, + "loss": 3.1775, + "theoretical_loss": 3.9634050733839272, + "tokens_seen": 446434304 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043672016048144436, + "loss": 2.9888, + "theoretical_loss": 3.9633411834865977, + "tokens_seen": 446499840 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043671013039117354, + "loss": 3.0954, + "theoretical_loss": 3.9632773055914434, + "tokens_seen": 446565376 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004367001003009027, + "loss": 3.1212, + "theoretical_loss": 3.96321343969445, + "tokens_seen": 446630912 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004366900702106319, + "loss": 3.202, + "theoretical_loss": 3.963149585791603, + "tokens_seen": 446696448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043668004012036113, + "loss": 3.2235, + "theoretical_loss": 3.963085743878891, + "tokens_seen": 446761984 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043667001003009026, + "loss": 3.0271, + "theoretical_loss": 3.963021913952304, + "tokens_seen": 446827520 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004366599799398195, + "loss": 3.1079, + "theoretical_loss": 3.962958096007835, + "tokens_seen": 446893056 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004366499498495486, + "loss": 3.2529, + "theoretical_loss": 3.9628942900414765, + "tokens_seen": 446958592 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043663991975927786, + "loss": 2.9971, + "theoretical_loss": 3.962830496049226, + "tokens_seen": 447024128 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043662988966900704, + "loss": 3.2448, + "theoretical_loss": 3.962766714027081, + "tokens_seen": 447089664 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004366198595787362, + "loss": 3.2233, + "theoretical_loss": 3.9627029439710406, + "tokens_seen": 447155200 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004366098294884654, + "loss": 3.0887, + "theoretical_loss": 3.9626391858771077, + "tokens_seen": 447220736 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 736807, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1511573791503906, + "objective/train/theoretical_loss": 3.962591375154386, + "objective/train/tokens_used": 467729888, + "theoretical_loss": 3.962591375154386, + "tokens_seen": 447269888 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004365997993981946, + "loss": 3.2723, + "theoretical_loss": 3.9625754397412845, + "tokens_seen": 447286272 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043658976930792376, + "loss": 3.0396, + "theoretical_loss": 3.9625117055595784, + "tokens_seen": 447351808 + }, + { + "epoch": 1.03, + "learning_rate": 0.000436579739217653, + "loss": 3.1449, + "theoretical_loss": 3.962447983327996, + "tokens_seen": 447417344 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043656970912738213, + "loss": 3.1901, + "theoretical_loss": 3.962384273042546, + "tokens_seen": 447482880 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043655967903711136, + "loss": 3.0105, + "theoretical_loss": 3.962320574699241, + "tokens_seen": 447548416 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004365496489468405, + "loss": 3.2413, + "theoretical_loss": 3.9622568882940943, + "tokens_seen": 447613952 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004365396188565697, + "loss": 3.2569, + "theoretical_loss": 3.9621932138231197, + "tokens_seen": 447679488 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004365295887662989, + "loss": 3.292, + "theoretical_loss": 3.9621295512823353, + "tokens_seen": 447745024 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004365195586760281, + "loss": 3.1418, + "theoretical_loss": 3.9620659006677608, + "tokens_seen": 447810560 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043650952858575727, + "loss": 3.0334, + "theoretical_loss": 3.9620022619754156, + "tokens_seen": 447876096 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004364994984954865, + "loss": 3.2968, + "theoretical_loss": 3.9619386352013235, + "tokens_seen": 447941632 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043648946840521563, + "loss": 3.1172, + "theoretical_loss": 3.961875020341509, + "tokens_seen": 448007168 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043647943831494487, + "loss": 3.1629, + "theoretical_loss": 3.961811417391999, + "tokens_seen": 448072704 + }, + { + "epoch": 1.03, + "learning_rate": 0.000436469408224674, + "loss": 2.8782, + "theoretical_loss": 3.9617478263488213, + "tokens_seen": 448138240 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043645937813440323, + "loss": 3.1548, + "theoretical_loss": 3.961684247208008, + "tokens_seen": 448203776 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004364493480441324, + "loss": 3.2047, + "theoretical_loss": 3.961620679965589, + "tokens_seen": 448269312 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004364393179538616, + "loss": 3.2486, + "theoretical_loss": 3.9615571246176002, + "tokens_seen": 448334848 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043642928786359077, + "loss": 3.0912, + "theoretical_loss": 3.961493581160078, + "tokens_seen": 448400384 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043641925777331995, + "loss": 3.1732, + "theoretical_loss": 3.96143004958906, + "tokens_seen": 448465920 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043640922768304913, + "loss": 3.1623, + "theoretical_loss": 3.9613665299005856, + "tokens_seen": 448531456 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043639919759277837, + "loss": 3.0085, + "theoretical_loss": 3.961303022090697, + "tokens_seen": 448596992 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004363891675025075, + "loss": 3.2158, + "theoretical_loss": 3.9612395261554383, + "tokens_seen": 448662528 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043637913741223673, + "loss": 3.0353, + "theoretical_loss": 3.9611760420908553, + "tokens_seen": 448728064 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004363691073219659, + "loss": 3.1293, + "theoretical_loss": 3.961112569892995, + "tokens_seen": 448793600 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004363590772316951, + "loss": 3.2305, + "theoretical_loss": 3.9610491095579072, + "tokens_seen": 448859136 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 739629, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9228355884552, + "objective/train/theoretical_loss": 3.96100152208916, + "objective/train/tokens_used": 469368288, + "theoretical_loss": 3.96100152208916, + "tokens_seen": 448908288 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004363490471414243, + "loss": 2.8967, + "theoretical_loss": 3.9609856610816427, + "tokens_seen": 448924672 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043633901705115346, + "loss": 3.1046, + "theoretical_loss": 3.9609222244602558, + "tokens_seen": 448990208 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043632898696088264, + "loss": 2.9096, + "theoretical_loss": 3.9608587996898006, + "tokens_seen": 449055744 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043631895687061187, + "loss": 3.163, + "theoretical_loss": 3.9607953867663346, + "tokens_seen": 449121280 + }, + { + "epoch": 1.03, + "learning_rate": 0.000436308926780341, + "loss": 3.1926, + "theoretical_loss": 3.960731985685916, + "tokens_seen": 449186816 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043629889669007023, + "loss": 3.0089, + "theoretical_loss": 3.9606685964446067, + "tokens_seen": 449252352 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043628886659979936, + "loss": 3.0937, + "theoretical_loss": 3.960605219038469, + "tokens_seen": 449317888 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004362788365095286, + "loss": 3.0281, + "theoretical_loss": 3.9605418534635666, + "tokens_seen": 449383424 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004362688064192578, + "loss": 3.1259, + "theoretical_loss": 3.9604784997159665, + "tokens_seen": 449448960 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043625877632898696, + "loss": 2.9547, + "theoretical_loss": 3.960415157791738, + "tokens_seen": 449514496 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043624874623871614, + "loss": 3.0553, + "theoretical_loss": 3.96035182768695, + "tokens_seen": 449580032 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004362387161484453, + "loss": 3.055, + "theoretical_loss": 3.9602885093976745, + "tokens_seen": 449645568 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004362286860581745, + "loss": 3.0511, + "theoretical_loss": 3.960225202919986, + "tokens_seen": 449711104 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043621865596790374, + "loss": 3.0635, + "theoretical_loss": 3.96016190824996, + "tokens_seen": 449776640 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043620862587763286, + "loss": 2.9579, + "theoretical_loss": 3.9600986253836745, + "tokens_seen": 449842176 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004361985957873621, + "loss": 3.0694, + "theoretical_loss": 3.960035354317209, + "tokens_seen": 449907712 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004361885656970913, + "loss": 3.1092, + "theoretical_loss": 3.959972095046645, + "tokens_seen": 449973248 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043617853560682046, + "loss": 3.1693, + "theoretical_loss": 3.9599088475680655, + "tokens_seen": 450038784 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004361685055165497, + "loss": 2.9348, + "theoretical_loss": 3.959845611877556, + "tokens_seen": 450104320 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004361584754262788, + "loss": 3.1002, + "theoretical_loss": 3.9597823879712033, + "tokens_seen": 450169856 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043614844533600806, + "loss": 3.1982, + "theoretical_loss": 3.9597191758450965, + "tokens_seen": 450235392 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043613841524573724, + "loss": 2.9684, + "theoretical_loss": 3.959655975495326, + "tokens_seen": 450300928 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004361283851554664, + "loss": 2.962, + "theoretical_loss": 3.9595927869179857, + "tokens_seen": 450366464 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004361183550651956, + "loss": 2.9162, + "theoretical_loss": 3.9595296101091684, + "tokens_seen": 450432000 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004361083249749248, + "loss": 3.1875, + "theoretical_loss": 3.9594664450649724, + "tokens_seen": 450497536 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 742386, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.108018398284912, + "objective/train/theoretical_loss": 3.9594190790000097, + "objective/train/tokens_used": 471006688, + "theoretical_loss": 3.9594190790000097, + "tokens_seen": 450546688 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043609829488465397, + "loss": 3.2213, + "theoretical_loss": 3.959403291781494, + "tokens_seen": 450563072 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004360882647943832, + "loss": 3.0581, + "theoretical_loss": 3.959340150254834, + "tokens_seen": 450628608 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043607823470411233, + "loss": 3.2041, + "theoretical_loss": 3.959277020481095, + "tokens_seen": 450694144 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043606820461384156, + "loss": 2.8778, + "theoretical_loss": 3.95921390245638, + "tokens_seen": 450759680 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004360581745235707, + "loss": 3.0293, + "theoretical_loss": 3.959150796176795, + "tokens_seen": 450825216 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004360481444332999, + "loss": 3.2243, + "theoretical_loss": 3.959087701638448, + "tokens_seen": 450890752 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004360381143430291, + "loss": 3.121, + "theoretical_loss": 3.9590246188374474, + "tokens_seen": 450956288 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004360280842527583, + "loss": 3.1588, + "theoretical_loss": 3.958961547769906, + "tokens_seen": 451021824 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043601805416248747, + "loss": 3.1745, + "theoretical_loss": 3.958898488431935, + "tokens_seen": 451087360 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004360080240722167, + "loss": 3.1393, + "theoretical_loss": 3.9588354408196507, + "tokens_seen": 451152896 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043599799398194583, + "loss": 3.1834, + "theoretical_loss": 3.9587724049291695, + "tokens_seen": 451218432 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043598796389167507, + "loss": 3.1469, + "theoretical_loss": 3.95870938075661, + "tokens_seen": 451283968 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004359779338014042, + "loss": 3.0457, + "theoretical_loss": 3.9586463682980924, + "tokens_seen": 451349504 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043596790371113343, + "loss": 3.0765, + "theoretical_loss": 3.95858336754974, + "tokens_seen": 451415040 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004359578736208626, + "loss": 3.1174, + "theoretical_loss": 3.958520378507676, + "tokens_seen": 451480576 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004359478435305918, + "loss": 3.1763, + "theoretical_loss": 3.958457401168027, + "tokens_seen": 451546112 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043593781344032097, + "loss": 3.0972, + "theoretical_loss": 3.958394435526921, + "tokens_seen": 451611648 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043592778335005015, + "loss": 3.1819, + "theoretical_loss": 3.958331481580487, + "tokens_seen": 451677184 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043591775325977933, + "loss": 3.2253, + "theoretical_loss": 3.9582685393248576, + "tokens_seen": 451742720 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043590772316950857, + "loss": 3.1822, + "theoretical_loss": 3.9582056087561655, + "tokens_seen": 451808256 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004358976930792377, + "loss": 3.1323, + "theoretical_loss": 3.958142689870546, + "tokens_seen": 451873792 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043588766298896693, + "loss": 2.8955, + "theoretical_loss": 3.958079782664136, + "tokens_seen": 451939328 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004358776328986961, + "loss": 3.2505, + "theoretical_loss": 3.958016887133075, + "tokens_seen": 452004864 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004358676028084253, + "loss": 3.1739, + "theoretical_loss": 3.957954003273504, + "tokens_seen": 452070400 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004358575727181545, + "loss": 3.2498, + "theoretical_loss": 3.9578911310815643, + "tokens_seen": 452135936 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 743743, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0118894577026367, + "objective/train/theoretical_loss": 3.957843984592174, + "objective/train/tokens_used": 472645088, + "theoretical_loss": 3.957843984592174, + "tokens_seen": 452185088 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043584754262788366, + "loss": 3.0822, + "theoretical_loss": 3.957828270553402, + "tokens_seen": 452201472 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043583751253761284, + "loss": 2.9364, + "theoretical_loss": 3.9577654216851617, + "tokens_seen": 452267008 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043582748244734207, + "loss": 3.2063, + "theoretical_loss": 3.957702584472993, + "tokens_seen": 452332544 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004358174523570712, + "loss": 3.1231, + "theoretical_loss": 3.9576397589130448, + "tokens_seen": 452398080 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043580742226680043, + "loss": 3.149, + "theoretical_loss": 3.9575769450014686, + "tokens_seen": 452463616 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043579739217652956, + "loss": 3.0001, + "theoretical_loss": 3.9575141427344196, + "tokens_seen": 452529152 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004357873620862588, + "loss": 2.9266, + "theoretical_loss": 3.957451352108052, + "tokens_seen": 452594688 + }, + { + "epoch": 1.03, + "learning_rate": 0.000435777331995988, + "loss": 3.1535, + "theoretical_loss": 3.9573885731185223, + "tokens_seen": 452660224 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043576730190571716, + "loss": 3.0883, + "theoretical_loss": 3.9573258057619913, + "tokens_seen": 452725760 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043575727181544634, + "loss": 3.233, + "theoretical_loss": 3.957263050034619, + "tokens_seen": 452791296 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004357472417251755, + "loss": 3.3292, + "theoretical_loss": 3.957200305932568, + "tokens_seen": 452856832 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004357372116349047, + "loss": 3.1955, + "theoretical_loss": 3.957137573452003, + "tokens_seen": 452922368 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043572718154463394, + "loss": 3.1982, + "theoretical_loss": 3.957074852589091, + "tokens_seen": 452987904 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043571715145436306, + "loss": 2.9341, + "theoretical_loss": 3.9570121433399987, + "tokens_seen": 453053440 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004357071213640923, + "loss": 3.007, + "theoretical_loss": 3.9569494457008973, + "tokens_seen": 453118976 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004356970912738215, + "loss": 3.1097, + "theoretical_loss": 3.9568867596679578, + "tokens_seen": 453184512 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043568706118355066, + "loss": 3.0963, + "theoretical_loss": 3.956824085237355, + "tokens_seen": 453250048 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043567703109327984, + "loss": 3.1629, + "theoretical_loss": 3.956761422405263, + "tokens_seen": 453315584 + }, + { + "epoch": 1.03, + "learning_rate": 0.000435667001003009, + "loss": 3.0768, + "theoretical_loss": 3.9566987711678596, + "tokens_seen": 453381120 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004356569709127382, + "loss": 3.213, + "theoretical_loss": 3.956636131521324, + "tokens_seen": 453446656 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043564694082246744, + "loss": 3.1878, + "theoretical_loss": 3.9565735034618372, + "tokens_seen": 453512192 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043563691073219657, + "loss": 3.0902, + "theoretical_loss": 3.9565108869855816, + "tokens_seen": 453577728 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004356268806419258, + "loss": 2.9694, + "theoretical_loss": 3.9564482820887417, + "tokens_seen": 453643264 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043561685055165493, + "loss": 3.2022, + "theoretical_loss": 3.9563856887675035, + "tokens_seen": 453708800 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043560682046138417, + "loss": 3.1296, + "theoretical_loss": 3.956323107018056, + "tokens_seen": 453774336 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 746173, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7517871856689453, + "objective/train/theoretical_loss": 3.956276178297665, + "objective/train/tokens_used": 474283488, + "theoretical_loss": 3.956276178297665, + "tokens_seen": 453823488 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043559679037111335, + "loss": 3.0579, + "theoretical_loss": 3.956260536836588, + "tokens_seen": 453839872 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043558676028084253, + "loss": 3.1933, + "theoretical_loss": 3.9561979782192918, + "tokens_seen": 453905408 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004355767301905717, + "loss": 2.9962, + "theoretical_loss": 3.956135431162361, + "tokens_seen": 453970944 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004355667001003009, + "loss": 2.9114, + "theoretical_loss": 3.9560728956619906, + "tokens_seen": 454036480 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043555667001003007, + "loss": 3.3007, + "theoretical_loss": 3.956010371714378, + "tokens_seen": 454102016 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004355466399197593, + "loss": 3.1226, + "theoretical_loss": 3.9559478593157222, + "tokens_seen": 454167552 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043553660982948843, + "loss": 3.1256, + "theoretical_loss": 3.9558853584622238, + "tokens_seen": 454233088 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043552657973921767, + "loss": 3.115, + "theoretical_loss": 3.955822869150085, + "tokens_seen": 454298624 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043551654964894685, + "loss": 3.2156, + "theoretical_loss": 3.9557603913755104, + "tokens_seen": 454364160 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043550651955867603, + "loss": 3.0953, + "theoretical_loss": 3.9556979251347064, + "tokens_seen": 454429696 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004354964894684052, + "loss": 3.1526, + "theoretical_loss": 3.9556354704238803, + "tokens_seen": 454495232 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004354864593781344, + "loss": 3.1596, + "theoretical_loss": 3.955573027239242, + "tokens_seen": 454560768 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004354764292878636, + "loss": 3.0451, + "theoretical_loss": 3.955510595577003, + "tokens_seen": 454626304 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004354663991975928, + "loss": 3.0331, + "theoretical_loss": 3.9554481754333772, + "tokens_seen": 454691840 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043545636910732194, + "loss": 3.0359, + "theoretical_loss": 3.955385766804579, + "tokens_seen": 454757376 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043544633901705117, + "loss": 3.0463, + "theoretical_loss": 3.9553233696868255, + "tokens_seen": 454822912 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004354363089267803, + "loss": 3.1862, + "theoretical_loss": 3.9552609840763346, + "tokens_seen": 454888448 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043542627883650953, + "loss": 3.0491, + "theoretical_loss": 3.9551986099693277, + "tokens_seen": 454953984 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043541624874623877, + "loss": 3.0952, + "theoretical_loss": 3.9551362473620273, + "tokens_seen": 455019520 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004354062186559679, + "loss": 3.0485, + "theoretical_loss": 3.9550738962506564, + "tokens_seen": 455085056 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043539618856569713, + "loss": 3.1036, + "theoretical_loss": 3.9550115566314403, + "tokens_seen": 455150592 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004353861584754263, + "loss": 3.0012, + "theoretical_loss": 3.9549492285006087, + "tokens_seen": 455216128 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004353761283851555, + "loss": 3.2378, + "theoretical_loss": 3.9548869118543895, + "tokens_seen": 455281664 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004353660982948847, + "loss": 3.1913, + "theoretical_loss": 3.954824606689013, + "tokens_seen": 455347200 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043535606820461386, + "loss": 3.2003, + "theoretical_loss": 3.9547623130007143, + "tokens_seen": 455412736 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 748965, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.216320753097534, + "objective/train/theoretical_loss": 3.9547156002640564, + "objective/train/tokens_used": 475921888, + "theoretical_loss": 3.9547156002640564, + "tokens_seen": 455461888 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043534603811434304, + "loss": 3.2401, + "theoretical_loss": 3.9547000307857267, + "tokens_seen": 455478272 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043533600802407227, + "loss": 3.1024, + "theoretical_loss": 3.9546377600402867, + "tokens_seen": 455543808 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004353259779338014, + "loss": 3.0719, + "theoretical_loss": 3.954575500760633, + "tokens_seen": 455609344 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043531594784353063, + "loss": 3.158, + "theoretical_loss": 3.954513252943005, + "tokens_seen": 455674880 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043530591775325976, + "loss": 3.1008, + "theoretical_loss": 3.9544510165836453, + "tokens_seen": 455740416 + }, + { + "epoch": 1.03, + "learning_rate": 0.000435295887662989, + "loss": 3.1483, + "theoretical_loss": 3.954388791678796, + "tokens_seen": 455805952 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004352858575727182, + "loss": 3.1748, + "theoretical_loss": 3.9543265782247046, + "tokens_seen": 455871488 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043527582748244736, + "loss": 3.0844, + "theoretical_loss": 3.9542643762176164, + "tokens_seen": 455937024 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043526579739217654, + "loss": 3.078, + "theoretical_loss": 3.9542021856537817, + "tokens_seen": 456002560 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004352557673019057, + "loss": 3.1622, + "theoretical_loss": 3.9541400065294496, + "tokens_seen": 456068096 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004352457372116349, + "loss": 3.1509, + "theoretical_loss": 3.9540778388408735, + "tokens_seen": 456133632 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043523570712136414, + "loss": 3.0387, + "theoretical_loss": 3.9540156825843074, + "tokens_seen": 456199168 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043522567703109326, + "loss": 3.1938, + "theoretical_loss": 3.953953537756007, + "tokens_seen": 456264704 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004352156469408225, + "loss": 3.1117, + "theoretical_loss": 3.9538914043522304, + "tokens_seen": 456330240 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004352056168505517, + "loss": 3.1602, + "theoretical_loss": 3.953829282369237, + "tokens_seen": 456395776 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043519558676028086, + "loss": 3.2633, + "theoretical_loss": 3.9537671718032876, + "tokens_seen": 456461312 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043518555667001004, + "loss": 3.0574, + "theoretical_loss": 3.9537050726506457, + "tokens_seen": 456526848 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004351755265797392, + "loss": 3.1759, + "theoretical_loss": 3.9536429849075754, + "tokens_seen": 456592384 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004351654964894684, + "loss": 3.0771, + "theoretical_loss": 3.953580908570344, + "tokens_seen": 456657920 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043515546639919764, + "loss": 3.2253, + "theoretical_loss": 3.9535188436352193, + "tokens_seen": 456723456 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043514543630892677, + "loss": 3.2382, + "theoretical_loss": 3.9534567900984716, + "tokens_seen": 456788992 + }, + { + "epoch": 1.03, + "learning_rate": 0.000435135406218656, + "loss": 3.2017, + "theoretical_loss": 3.9533947479563722, + "tokens_seen": 456854528 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043512537612838513, + "loss": 3.3393, + "theoretical_loss": 3.953332717205195, + "tokens_seen": 456920064 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043511534603811437, + "loss": 3.052, + "theoretical_loss": 3.953270697841215, + "tokens_seen": 456985600 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043510531594784355, + "loss": 3.2345, + "theoretical_loss": 3.95320868986071, + "tokens_seen": 457051136 + }, + { + "epoch": 1.03, + "objective/train/docs_used": 751562, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4195444583892822, + "objective/train/theoretical_loss": 3.953162191343497, + "objective/train/tokens_used": 477560288, + "theoretical_loss": 3.953162191343497, + "tokens_seen": 457100288 + }, + { + "epoch": 1.03, + "learning_rate": 0.00043509528585757273, + "loss": 2.9831, + "theoretical_loss": 3.9531466932599573, + "tokens_seen": 457116672 + }, + { + "epoch": 1.03, + "learning_rate": 0.0004350852557673019, + "loss": 3.3295, + "theoretical_loss": 3.953084708035239, + "tokens_seen": 457182208 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004350752256770311, + "loss": 3.0914, + "theoretical_loss": 3.9530227341828366, + "tokens_seen": 457247744 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043506519558676027, + "loss": 3.1507, + "theoretical_loss": 3.9529607716990336, + "tokens_seen": 457313280 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004350551654964895, + "loss": 2.9907, + "theoretical_loss": 3.952898820580117, + "tokens_seen": 457378816 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043504513540621863, + "loss": 3.0711, + "theoretical_loss": 3.9528368808223737, + "tokens_seen": 457444352 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043503510531594787, + "loss": 3.1412, + "theoretical_loss": 3.952774952422093, + "tokens_seen": 457509888 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043502507522567705, + "loss": 3.2076, + "theoretical_loss": 3.952713035375566, + "tokens_seen": 457575424 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043501504513540623, + "loss": 3.0681, + "theoretical_loss": 3.9526511296790856, + "tokens_seen": 457640960 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004350050150451354, + "loss": 3.0957, + "theoretical_loss": 3.9525892353289453, + "tokens_seen": 457706496 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004349949849548646, + "loss": 3.0852, + "theoretical_loss": 3.952527352321443, + "tokens_seen": 457772032 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004349849548645938, + "loss": 3.1163, + "theoretical_loss": 3.952465480652875, + "tokens_seen": 457837568 + }, + { + "epoch": 1.04, + "learning_rate": 0.000434974924774323, + "loss": 3.1071, + "theoretical_loss": 3.952403620319542, + "tokens_seen": 457903104 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043496489468405214, + "loss": 3.2045, + "theoretical_loss": 3.9523417713177453, + "tokens_seen": 457968640 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043495486459378137, + "loss": 3.1525, + "theoretical_loss": 3.952279933643788, + "tokens_seen": 458034176 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004349448345035105, + "loss": 3.2657, + "theoretical_loss": 3.952218107293975, + "tokens_seen": 458099712 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043493480441323973, + "loss": 3.1175, + "theoretical_loss": 3.952156292264613, + "tokens_seen": 458165248 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004349247743229689, + "loss": 2.9936, + "theoretical_loss": 3.95209448855201, + "tokens_seen": 458230784 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004349147442326981, + "loss": 2.9859, + "theoretical_loss": 3.952032696152477, + "tokens_seen": 458296320 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004349047141424273, + "loss": 3.067, + "theoretical_loss": 3.951970915062325, + "tokens_seen": 458361856 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004348946840521565, + "loss": 3.0674, + "theoretical_loss": 3.9519091452778676, + "tokens_seen": 458427392 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043488465396188564, + "loss": 3.1001, + "theoretical_loss": 3.951847386795421, + "tokens_seen": 458492928 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004348746238716149, + "loss": 3.0428, + "theoretical_loss": 3.9517856396113014, + "tokens_seen": 458558464 + }, + { + "epoch": 1.04, + "learning_rate": 0.000434864593781344, + "loss": 3.0529, + "theoretical_loss": 3.9517239037218275, + "tokens_seen": 458624000 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043485456369107324, + "loss": 3.1602, + "theoretical_loss": 3.9516621791233204, + "tokens_seen": 458689536 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 754498, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0314295291900635, + "objective/train/theoretical_loss": 3.9516158930819243, + "objective/train/tokens_used": 479198688, + "theoretical_loss": 3.9516158930819243, + "tokens_seen": 458738688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004348445336008024, + "loss": 3.0706, + "theoretical_loss": 3.951600465812102, + "tokens_seen": 458755072 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004348345035105316, + "loss": 3.0259, + "theoretical_loss": 3.9515387637844963, + "tokens_seen": 458820608 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004348244734202608, + "loss": 3.2219, + "theoretical_loss": 3.9514770730368283, + "tokens_seen": 458886144 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043481444332998996, + "loss": 3.1716, + "theoretical_loss": 3.9514153935654264, + "tokens_seen": 458951680 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043480441323971914, + "loss": 3.2016, + "theoretical_loss": 3.9513537253666184, + "tokens_seen": 459017216 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004347943831494484, + "loss": 2.9939, + "theoretical_loss": 3.9512920684367367, + "tokens_seen": 459082752 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004347843530591775, + "loss": 3.2167, + "theoretical_loss": 3.951230422772113, + "tokens_seen": 459148288 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043477432296890674, + "loss": 3.0094, + "theoretical_loss": 3.9511687883690816, + "tokens_seen": 459213824 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043476429287863587, + "loss": 2.9901, + "theoretical_loss": 3.951107165223978, + "tokens_seen": 459279360 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004347542627883651, + "loss": 3.1171, + "theoretical_loss": 3.951045553333141, + "tokens_seen": 459344896 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004347442326980943, + "loss": 3.1589, + "theoretical_loss": 3.950983952692909, + "tokens_seen": 459410432 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043473420260782346, + "loss": 3.2768, + "theoretical_loss": 3.950922363299623, + "tokens_seen": 459475968 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043472417251755265, + "loss": 3.3052, + "theoretical_loss": 3.9508607851496267, + "tokens_seen": 459541504 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004347141424272819, + "loss": 3.1576, + "theoretical_loss": 3.9507992182392644, + "tokens_seen": 459607040 + }, + { + "epoch": 1.04, + "learning_rate": 0.000434704112337011, + "loss": 2.9916, + "theoretical_loss": 3.9507376625648822, + "tokens_seen": 459672576 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043469408224674024, + "loss": 3.2688, + "theoretical_loss": 3.9506761181228276, + "tokens_seen": 459738112 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004346840521564694, + "loss": 3.2259, + "theoretical_loss": 3.9506145849094505, + "tokens_seen": 459803648 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004346740220661986, + "loss": 3.1693, + "theoretical_loss": 3.9505530629211023, + "tokens_seen": 459869184 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043466399197592784, + "loss": 3.0936, + "theoretical_loss": 3.950491552154136, + "tokens_seen": 459934720 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043465396188565697, + "loss": 3.0363, + "theoretical_loss": 3.950430052604907, + "tokens_seen": 460000256 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004346439317953862, + "loss": 3.095, + "theoretical_loss": 3.9503685642697706, + "tokens_seen": 460065792 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043463390170511533, + "loss": 3.192, + "theoretical_loss": 3.9503070871450863, + "tokens_seen": 460131328 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043462387161484457, + "loss": 3.0756, + "theoretical_loss": 3.950245621227213, + "tokens_seen": 460196864 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043461384152457375, + "loss": 3.0462, + "theoretical_loss": 3.9501841665125124, + "tokens_seen": 460262400 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043460381143430293, + "loss": 3.1849, + "theoretical_loss": 3.9501227229973486, + "tokens_seen": 460327936 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 757423, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.825490951538086, + "objective/train/theoretical_loss": 3.9500766477084843, + "objective/train/tokens_used": 480837088, + "theoretical_loss": 3.9500766477084843, + "tokens_seen": 460377088 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004345937813440321, + "loss": 3.0225, + "theoretical_loss": 3.950061290678085, + "tokens_seen": 460393472 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004345837512537613, + "loss": 3.1007, + "theoretical_loss": 3.94999986955109, + "tokens_seen": 460459008 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043457372116349047, + "loss": 3.1847, + "theoretical_loss": 3.9499384596127305, + "tokens_seen": 460524544 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004345636910732197, + "loss": 2.9931, + "theoretical_loss": 3.949877060859378, + "tokens_seen": 460590080 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043455366098294883, + "loss": 3.2473, + "theoretical_loss": 3.949815673287403, + "tokens_seen": 460655616 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043454363089267807, + "loss": 3.1583, + "theoretical_loss": 3.949754296893179, + "tokens_seen": 460721152 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043453360080240725, + "loss": 3.0613, + "theoretical_loss": 3.949692931673082, + "tokens_seen": 460786688 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043452357071213643, + "loss": 3.0041, + "theoretical_loss": 3.9496315776234883, + "tokens_seen": 460852224 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004345135406218656, + "loss": 3.1978, + "theoretical_loss": 3.949570234740776, + "tokens_seen": 460917760 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004345035105315948, + "loss": 3.1973, + "theoretical_loss": 3.949508903021327, + "tokens_seen": 460983296 + }, + { + "epoch": 1.04, + "learning_rate": 0.000434493480441324, + "loss": 3.195, + "theoretical_loss": 3.949447582461521, + "tokens_seen": 461048832 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004344834503510532, + "loss": 3.0423, + "theoretical_loss": 3.9493862730577427, + "tokens_seen": 461114368 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043447342026078234, + "loss": 3.0607, + "theoretical_loss": 3.9493249748063777, + "tokens_seen": 461179904 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043446339017051157, + "loss": 3.2372, + "theoretical_loss": 3.949263687703812, + "tokens_seen": 461245440 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004344533600802407, + "loss": 3.0677, + "theoretical_loss": 3.949202411746435, + "tokens_seen": 461310976 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043444332998996993, + "loss": 2.9868, + "theoretical_loss": 3.9491411469306366, + "tokens_seen": 461376512 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004344332998996991, + "loss": 3.0223, + "theoretical_loss": 3.949079893252809, + "tokens_seen": 461442048 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004344232698094283, + "loss": 3.1766, + "theoretical_loss": 3.9490186507093457, + "tokens_seen": 461507584 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004344132397191575, + "loss": 3.2687, + "theoretical_loss": 3.9489574192966423, + "tokens_seen": 461573120 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004344032096288867, + "loss": 3.1689, + "theoretical_loss": 3.948896199011096, + "tokens_seen": 461638656 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043439317953861584, + "loss": 3.1798, + "theoretical_loss": 3.9488349898491046, + "tokens_seen": 461704192 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004343831494483451, + "loss": 3.0751, + "theoretical_loss": 3.9487737918070698, + "tokens_seen": 461769728 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004343731193580742, + "loss": 2.7853, + "theoretical_loss": 3.9487126048813925, + "tokens_seen": 461835264 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043436308926780344, + "loss": 3.014, + "theoretical_loss": 3.9486514290684767, + "tokens_seen": 461900800 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004343530591775326, + "loss": 3.099, + "theoretical_loss": 3.9485902643647286, + "tokens_seen": 461966336 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 760065, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.67655348777771, + "objective/train/theoretical_loss": 3.948544398125147, + "objective/train/tokens_used": 482475488, + "theoretical_loss": 3.948544398125147, + "tokens_seen": 462015488 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004343430290872618, + "loss": 3.0523, + "theoretical_loss": 3.9485291107665548, + "tokens_seen": 462031872 + }, + { + "epoch": 1.04, + "learning_rate": 0.000434332998996991, + "loss": 3.1013, + "theoretical_loss": 3.948467968270364, + "tokens_seen": 462097408 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043432296890672016, + "loss": 3.0063, + "theoretical_loss": 3.948406836872566, + "tokens_seen": 462162944 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043431293881644934, + "loss": 3.0804, + "theoretical_loss": 3.948345716569574, + "tokens_seen": 462228480 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004343029087261786, + "loss": 3.0825, + "theoretical_loss": 3.9482846073578015, + "tokens_seen": 462294016 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004342928786359077, + "loss": 3.2808, + "theoretical_loss": 3.9482235092336637, + "tokens_seen": 462359552 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043428284854563694, + "loss": 3.1333, + "theoretical_loss": 3.948162422193578, + "tokens_seen": 462425088 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043427281845536607, + "loss": 3.0625, + "theoretical_loss": 3.948101346233962, + "tokens_seen": 462490624 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004342627883650953, + "loss": 3.2159, + "theoretical_loss": 3.9480402813512376, + "tokens_seen": 462556160 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004342527582748245, + "loss": 3.0322, + "theoretical_loss": 3.9479792275418264, + "tokens_seen": 462621696 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043424272818455367, + "loss": 3.1052, + "theoretical_loss": 3.947918184802152, + "tokens_seen": 462687232 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043423269809428285, + "loss": 3.1556, + "theoretical_loss": 3.9478571531286395, + "tokens_seen": 462752768 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004342226680040121, + "loss": 3.0631, + "theoretical_loss": 3.947796132517717, + "tokens_seen": 462818304 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004342126379137412, + "loss": 3.0463, + "theoretical_loss": 3.9477351229658124, + "tokens_seen": 462883840 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043420260782347044, + "loss": 3.3275, + "theoretical_loss": 3.9476741244693567, + "tokens_seen": 462949376 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043419257773319957, + "loss": 3.0923, + "theoretical_loss": 3.947613137024781, + "tokens_seen": 463014912 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004341825476429288, + "loss": 3.1644, + "theoretical_loss": 3.9475521606285198, + "tokens_seen": 463080448 + }, + { + "epoch": 1.04, + "learning_rate": 0.000434172517552658, + "loss": 3.1255, + "theoretical_loss": 3.947491195277008, + "tokens_seen": 463145984 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043416248746238717, + "loss": 2.9443, + "theoretical_loss": 3.9474302409666837, + "tokens_seen": 463211520 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043415245737211635, + "loss": 3.2046, + "theoretical_loss": 3.9473692976939843, + "tokens_seen": 463277056 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043414242728184553, + "loss": 3.2371, + "theoretical_loss": 3.94730836545535, + "tokens_seen": 463342592 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004341323971915747, + "loss": 3.1343, + "theoretical_loss": 3.947247444247224, + "tokens_seen": 463408128 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043412236710130395, + "loss": 3.0322, + "theoretical_loss": 3.947186534066049, + "tokens_seen": 463473664 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004341123370110331, + "loss": 3.106, + "theoretical_loss": 3.9471256349082706, + "tokens_seen": 463539200 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004341023069207623, + "loss": 3.1393, + "theoretical_loss": 3.9470647467703364, + "tokens_seen": 463604736 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 762852, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.073610782623291, + "objective/train/theoretical_loss": 3.9470190878965203, + "objective/train/tokens_used": 484113888, + "theoretical_loss": 3.9470190878965203, + "tokens_seen": 463653888 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043409227683049144, + "loss": 3.1357, + "theoretical_loss": 3.947003869648693, + "tokens_seen": 463670272 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043408224674022067, + "loss": 3.1762, + "theoretical_loss": 3.9469430035397925, + "tokens_seen": 463735808 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043407221664994985, + "loss": 3.171, + "theoretical_loss": 3.946882148440086, + "tokens_seen": 463801344 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043406218655967903, + "loss": 3.2484, + "theoretical_loss": 3.9468213043460274, + "tokens_seen": 463866880 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004340521564694082, + "loss": 2.9702, + "theoretical_loss": 3.9467604712540716, + "tokens_seen": 463932416 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043404212637913745, + "loss": 3.1065, + "theoretical_loss": 3.9466996491606747, + "tokens_seen": 463997952 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004340320962888666, + "loss": 2.9486, + "theoretical_loss": 3.9466388380622965, + "tokens_seen": 464063488 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004340220661985958, + "loss": 3.2203, + "theoretical_loss": 3.946578037955396, + "tokens_seen": 464129024 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043401203610832494, + "loss": 2.9769, + "theoretical_loss": 3.946517248836436, + "tokens_seen": 464194560 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004340020060180542, + "loss": 3.0524, + "theoretical_loss": 3.9464564707018788, + "tokens_seen": 464260096 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043399197592778336, + "loss": 2.9382, + "theoretical_loss": 3.946395703548189, + "tokens_seen": 464325632 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043398194583751254, + "loss": 3.1287, + "theoretical_loss": 3.9463349473718345, + "tokens_seen": 464391168 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004339719157472417, + "loss": 3.1285, + "theoretical_loss": 3.9462742021692834, + "tokens_seen": 464456704 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004339618856569709, + "loss": 3.0722, + "theoretical_loss": 3.946213467937005, + "tokens_seen": 464522240 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004339518555667001, + "loss": 3.1548, + "theoretical_loss": 3.946152744671471, + "tokens_seen": 464587776 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004339418254764293, + "loss": 3.0198, + "theoretical_loss": 3.9460920323691546, + "tokens_seen": 464653312 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004339317953861585, + "loss": 3.0828, + "theoretical_loss": 3.9460313310265307, + "tokens_seen": 464718848 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004339217652958877, + "loss": 3.1819, + "theoretical_loss": 3.9459706406400756, + "tokens_seen": 464784384 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004339117352056169, + "loss": 3.0181, + "theoretical_loss": 3.945909961206267, + "tokens_seen": 464849920 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043390170511534604, + "loss": 3.1439, + "theoretical_loss": 3.9458492927215856, + "tokens_seen": 464915456 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004338916750250753, + "loss": 3.066, + "theoretical_loss": 3.9457886351825118, + "tokens_seen": 464980992 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004338816449348044, + "loss": 3.1743, + "theoretical_loss": 3.9457279885855288, + "tokens_seen": 465046528 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043387161484453364, + "loss": 3.1783, + "theoretical_loss": 3.9456673529271216, + "tokens_seen": 465112064 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004338615847542628, + "loss": 3.0094, + "theoretical_loss": 3.9456067282037752, + "tokens_seen": 465177600 + }, + { + "epoch": 1.04, + "learning_rate": 0.000433851554663992, + "loss": 3.1578, + "theoretical_loss": 3.9455461144119788, + "tokens_seen": 465243136 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 765522, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3187942504882812, + "objective/train/theoretical_loss": 3.9455006612398487, + "objective/train/tokens_used": 485752288, + "theoretical_loss": 3.9455006612398487, + "tokens_seen": 465292288 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004338415245737212, + "loss": 3.1406, + "theoretical_loss": 3.9454855115482212, + "tokens_seen": 465308672 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043383149448345036, + "loss": 3.0079, + "theoretical_loss": 3.9454249196089934, + "tokens_seen": 465374208 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043382146439317954, + "loss": 3.0602, + "theoretical_loss": 3.9453643385907875, + "tokens_seen": 465439744 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004338114343029088, + "loss": 3.2425, + "theoretical_loss": 3.945303768490099, + "tokens_seen": 465505280 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004338014042126379, + "loss": 3.1216, + "theoretical_loss": 3.9452432093034235, + "tokens_seen": 465570816 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043379137412236714, + "loss": 3.0807, + "theoretical_loss": 3.945182661027258, + "tokens_seen": 465636352 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043378134403209627, + "loss": 3.0794, + "theoretical_loss": 3.9451221236581016, + "tokens_seen": 465701888 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004337713139418255, + "loss": 3.1878, + "theoretical_loss": 3.945061597192456, + "tokens_seen": 465767424 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004337612838515547, + "loss": 3.0178, + "theoretical_loss": 3.9450010816268226, + "tokens_seen": 465832960 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043375125376128387, + "loss": 3.1069, + "theoretical_loss": 3.9449405769577055, + "tokens_seen": 465898496 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043374122367101305, + "loss": 3.1282, + "theoretical_loss": 3.9448800831816113, + "tokens_seen": 465964032 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004337311935807423, + "loss": 3.1285, + "theoretical_loss": 3.9448196002950455, + "tokens_seen": 466029568 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004337211634904714, + "loss": 3.0764, + "theoretical_loss": 3.9447591282945185, + "tokens_seen": 466095104 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043371113340020064, + "loss": 3.2411, + "theoretical_loss": 3.94469866717654, + "tokens_seen": 466160640 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043370110330992977, + "loss": 3.0114, + "theoretical_loss": 3.9446382169376224, + "tokens_seen": 466226176 + }, + { + "epoch": 1.04, + "learning_rate": 0.000433691073219659, + "loss": 3.1615, + "theoretical_loss": 3.944577777574279, + "tokens_seen": 466291712 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004336810431293882, + "loss": 3.2067, + "theoretical_loss": 3.944517349083025, + "tokens_seen": 466357248 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043367101303911737, + "loss": 3.1952, + "theoretical_loss": 3.944456931460378, + "tokens_seen": 466422784 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043366098294884655, + "loss": 3.0525, + "theoretical_loss": 3.9443965247028556, + "tokens_seen": 466488320 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043365095285857573, + "loss": 3.1558, + "theoretical_loss": 3.9443361288069783, + "tokens_seen": 466553856 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004336409227683049, + "loss": 3.0963, + "theoretical_loss": 3.9442757437692673, + "tokens_seen": 466619392 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043363089267803415, + "loss": 3.2012, + "theoretical_loss": 3.9442153695862467, + "tokens_seen": 466684928 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004336208625877633, + "loss": 2.9936, + "theoretical_loss": 3.944155006254441, + "tokens_seen": 466750464 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004336108324974925, + "loss": 3.1377, + "theoretical_loss": 3.9440946537703767, + "tokens_seen": 466816000 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043360080240722164, + "loss": 3.0918, + "theoretical_loss": 3.9440343121305816, + "tokens_seen": 466881536 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 766913, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4110634326934814, + "objective/train/theoretical_loss": 3.9439890630151995, + "objective/train/tokens_used": 487390688, + "theoretical_loss": 3.9439890630151995, + "tokens_seen": 466930688 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043359077231695087, + "loss": 3.2692, + "theoretical_loss": 3.9439739813315855, + "tokens_seen": 466947072 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043358074222668005, + "loss": 3.2489, + "theoretical_loss": 3.94391366136992, + "tokens_seen": 467012608 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043357071213640923, + "loss": 3.064, + "theoretical_loss": 3.943853352242118, + "tokens_seen": 467078144 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004335606820461384, + "loss": 3.1732, + "theoretical_loss": 3.943793053944713, + "tokens_seen": 467143680 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043355065195586765, + "loss": 3.0888, + "theoretical_loss": 3.943732766474242, + "tokens_seen": 467209216 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004335406218655968, + "loss": 3.1568, + "theoretical_loss": 3.943672489827243, + "tokens_seen": 467274752 + }, + { + "epoch": 1.04, + "learning_rate": 0.000433530591775326, + "loss": 2.9235, + "theoretical_loss": 3.9436122240002547, + "tokens_seen": 467340288 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043352056168505514, + "loss": 3.0986, + "theoretical_loss": 3.9435519689898175, + "tokens_seen": 467405824 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004335105315947844, + "loss": 3.038, + "theoretical_loss": 3.943491724792475, + "tokens_seen": 467471360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043350050150451356, + "loss": 3.219, + "theoretical_loss": 3.9434314914047697, + "tokens_seen": 467536896 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043349047141424274, + "loss": 3.0319, + "theoretical_loss": 3.943371268823248, + "tokens_seen": 467602432 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004334804413239719, + "loss": 3.095, + "theoretical_loss": 3.9433110570444576, + "tokens_seen": 467667968 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004334704112337011, + "loss": 3.2096, + "theoretical_loss": 3.943250856064947, + "tokens_seen": 467733504 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004334603811434303, + "loss": 3.0726, + "theoretical_loss": 3.9431906658812657, + "tokens_seen": 467799040 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004334503510531595, + "loss": 3.1975, + "theoretical_loss": 3.9431304864899666, + "tokens_seen": 467864576 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043344032096288864, + "loss": 3.1816, + "theoretical_loss": 3.943070317887603, + "tokens_seen": 467930112 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004334302908726179, + "loss": 3.0188, + "theoretical_loss": 3.94301016007073, + "tokens_seen": 467995648 + }, + { + "epoch": 1.04, + "learning_rate": 0.000433420260782347, + "loss": 3.3186, + "theoretical_loss": 3.9429500130359045, + "tokens_seen": 468061184 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043341023069207624, + "loss": 3.094, + "theoretical_loss": 3.9428898767796845, + "tokens_seen": 468126720 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004334002006018054, + "loss": 3.1864, + "theoretical_loss": 3.94282975129863, + "tokens_seen": 468192256 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004333901705115346, + "loss": 3.1964, + "theoretical_loss": 3.9427696365893024, + "tokens_seen": 468257792 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004333801404212638, + "loss": 3.1646, + "theoretical_loss": 3.942709532648265, + "tokens_seen": 468323328 + }, + { + "epoch": 1.04, + "learning_rate": 0.000433370110330993, + "loss": 3.0684, + "theoretical_loss": 3.9426494394720812, + "tokens_seen": 468388864 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043336008024072215, + "loss": 3.1638, + "theoretical_loss": 3.942589357057319, + "tokens_seen": 468454400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004333500501504514, + "loss": 3.0682, + "theoretical_loss": 3.9425292854005454, + "tokens_seen": 468519936 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 769781, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0408291816711426, + "objective/train/theoretical_loss": 3.942484238715831, + "objective/train/tokens_used": 489029088, + "theoretical_loss": 3.942484238715831, + "tokens_seen": 468569088 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004333400200601805, + "loss": 3.0948, + "theoretical_loss": 3.9424692244983293, + "tokens_seen": 468585472 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043332998996990974, + "loss": 3.1851, + "theoretical_loss": 3.942409174347242, + "tokens_seen": 468651008 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004333199598796389, + "loss": 3.169, + "theoretical_loss": 3.9423491349438557, + "tokens_seen": 468716544 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004333099297893681, + "loss": 3.219, + "theoretical_loss": 3.942289106284745, + "tokens_seen": 468782080 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004332998996990973, + "loss": 3.1358, + "theoretical_loss": 3.942229088366485, + "tokens_seen": 468847616 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043328986960882647, + "loss": 3.0407, + "theoretical_loss": 3.9421690811856527, + "tokens_seen": 468913152 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043327983951855565, + "loss": 3.0933, + "theoretical_loss": 3.942109084738828, + "tokens_seen": 468978688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004332698094282849, + "loss": 3.1528, + "theoretical_loss": 3.9420490990225896, + "tokens_seen": 469044224 + }, + { + "epoch": 1.04, + "learning_rate": 0.000433259779338014, + "loss": 3.1818, + "theoretical_loss": 3.9419891240335208, + "tokens_seen": 469109760 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043324974924774325, + "loss": 3.2339, + "theoretical_loss": 3.941929159768204, + "tokens_seen": 469175296 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043323971915747243, + "loss": 3.1396, + "theoretical_loss": 3.941869206223225, + "tokens_seen": 469240832 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004332296890672016, + "loss": 3.2735, + "theoretical_loss": 3.94180926339517, + "tokens_seen": 469306368 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004332196589769308, + "loss": 2.99, + "theoretical_loss": 3.9417493312806275, + "tokens_seen": 469371904 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043320962888665997, + "loss": 3.1056, + "theoretical_loss": 3.9416894098761865, + "tokens_seen": 469437440 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043319959879638915, + "loss": 3.03, + "theoretical_loss": 3.9416294991784393, + "tokens_seen": 469502976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004331895687061184, + "loss": 3.0625, + "theoretical_loss": 3.941569599183978, + "tokens_seen": 469568512 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043317953861584757, + "loss": 3.1628, + "theoretical_loss": 3.941509709889397, + "tokens_seen": 469634048 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043316950852557675, + "loss": 3.0413, + "theoretical_loss": 3.9414498312912927, + "tokens_seen": 469699584 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043315947843530593, + "loss": 3.1867, + "theoretical_loss": 3.9413899633862624, + "tokens_seen": 469765120 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004331494483450351, + "loss": 2.9758, + "theoretical_loss": 3.9413301061709047, + "tokens_seen": 469830656 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043313941825476435, + "loss": 2.9883, + "theoretical_loss": 3.941270259641821, + "tokens_seen": 469896192 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004331293881644935, + "loss": 3.0905, + "theoretical_loss": 3.9412104237956127, + "tokens_seen": 469961728 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004331193580742227, + "loss": 2.9672, + "theoretical_loss": 3.9411505986288846, + "tokens_seen": 470027264 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043310932798395184, + "loss": 3.0611, + "theoretical_loss": 3.9410907841382405, + "tokens_seen": 470092800 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043309929789368107, + "loss": 3.1635, + "theoretical_loss": 3.941030980320289, + "tokens_seen": 470158336 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 772206, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2430427074432373, + "objective/train/theoretical_loss": 3.9409861344587385, + "objective/train/tokens_used": 490667488, + "theoretical_loss": 3.9409861344587385, + "tokens_seen": 470207488 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043308926780341025, + "loss": 3.2207, + "theoretical_loss": 3.940971187171637, + "tokens_seen": 470223872 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043307923771313943, + "loss": 3.266, + "theoretical_loss": 3.940911404688895, + "tokens_seen": 470289408 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004330692076228686, + "loss": 2.9104, + "theoretical_loss": 3.940851632868675, + "tokens_seen": 470354944 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043305917753259785, + "loss": 3.2956, + "theoretical_loss": 3.940791871707589, + "tokens_seen": 470420480 + }, + { + "epoch": 1.04, + "learning_rate": 0.000433049147442327, + "loss": 3.0473, + "theoretical_loss": 3.9407321212022524, + "tokens_seen": 470486016 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004330391173520562, + "loss": 3.1757, + "theoretical_loss": 3.9406723813492808, + "tokens_seen": 470551552 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043302908726178534, + "loss": 3.0941, + "theoretical_loss": 3.940612652145292, + "tokens_seen": 470617088 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004330190571715146, + "loss": 3.1229, + "theoretical_loss": 3.9405529335869063, + "tokens_seen": 470682624 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043300902708124376, + "loss": 3.0279, + "theoretical_loss": 3.9404932256707426, + "tokens_seen": 470748160 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043299899699097294, + "loss": 3.1667, + "theoretical_loss": 3.9404335283934246, + "tokens_seen": 470813696 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004329889669007021, + "loss": 3.158, + "theoretical_loss": 3.9403738417515757, + "tokens_seen": 470879232 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004329789368104313, + "loss": 3.1981, + "theoretical_loss": 3.940314165741821, + "tokens_seen": 470944768 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004329689067201605, + "loss": 2.9643, + "theoretical_loss": 3.940254500360788, + "tokens_seen": 471010304 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004329588766298897, + "loss": 3.0841, + "theoretical_loss": 3.9401948456051046, + "tokens_seen": 471075840 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043294884653961884, + "loss": 3.0498, + "theoretical_loss": 3.940135201471401, + "tokens_seen": 471141376 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004329388164493481, + "loss": 3.1539, + "theoretical_loss": 3.940075567956309, + "tokens_seen": 471206912 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004329287863590772, + "loss": 3.0275, + "theoretical_loss": 3.9400159450564614, + "tokens_seen": 471272448 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043291875626880644, + "loss": 3.1802, + "theoretical_loss": 3.939956332768493, + "tokens_seen": 471337984 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004329087261785356, + "loss": 3.1957, + "theoretical_loss": 3.939896731089041, + "tokens_seen": 471403520 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004328986960882648, + "loss": 3.2497, + "theoretical_loss": 3.939837140014741, + "tokens_seen": 471469056 + }, + { + "epoch": 1.04, + "learning_rate": 0.000432888665997994, + "loss": 3.2239, + "theoretical_loss": 3.939777559542233, + "tokens_seen": 471534592 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004328786359077232, + "loss": 3.0257, + "theoretical_loss": 3.939717989668158, + "tokens_seen": 471600128 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043286860581745235, + "loss": 3.2169, + "theoretical_loss": 3.9396584303891586, + "tokens_seen": 471665664 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004328585757271816, + "loss": 3.0285, + "theoretical_loss": 3.939598881701878, + "tokens_seen": 471731200 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004328485456369107, + "loss": 3.0827, + "theoretical_loss": 3.939539343602962, + "tokens_seen": 471796736 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 774963, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9794204235076904, + "objective/train/theoretical_loss": 3.939494696975372, + "objective/train/tokens_used": 492305888, + "theoretical_loss": 3.939494696975372, + "tokens_seen": 471845888 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043283851554663994, + "loss": 3.0123, + "theoretical_loss": 3.939479816089057, + "tokens_seen": 471862272 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004328284854563691, + "loss": 3.0038, + "theoretical_loss": 3.9394202991568124, + "tokens_seen": 471927808 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004328184553660983, + "loss": 3.1075, + "theoretical_loss": 3.9393607928028764, + "tokens_seen": 471993344 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004328084252758275, + "loss": 3.0773, + "theoretical_loss": 3.9393012970239023, + "tokens_seen": 472058880 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043279839518555667, + "loss": 3.0178, + "theoretical_loss": 3.939241811816542, + "tokens_seen": 472124416 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043278836509528585, + "loss": 3.109, + "theoretical_loss": 3.93918233717745, + "tokens_seen": 472189952 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004327783350050151, + "loss": 3.0764, + "theoretical_loss": 3.939122873103283, + "tokens_seen": 472255488 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004327683049147442, + "loss": 2.9527, + "theoretical_loss": 3.9390634195906973, + "tokens_seen": 472321024 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043275827482447345, + "loss": 3.0077, + "theoretical_loss": 3.9390039766363536, + "tokens_seen": 472386560 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043274824473420263, + "loss": 2.9471, + "theoretical_loss": 3.9389445442369113, + "tokens_seen": 472452096 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004327382146439318, + "loss": 2.9818, + "theoretical_loss": 3.938885122389033, + "tokens_seen": 472517632 + }, + { + "epoch": 1.04, + "learning_rate": 0.000432728184553661, + "loss": 3.1504, + "theoretical_loss": 3.938825711089382, + "tokens_seen": 472583168 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043271815446339017, + "loss": 3.1098, + "theoretical_loss": 3.938766310334624, + "tokens_seen": 472648704 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043270812437311935, + "loss": 3.1007, + "theoretical_loss": 3.9387069201214246, + "tokens_seen": 472714240 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004326980942828486, + "loss": 3.1002, + "theoretical_loss": 3.9386475404464534, + "tokens_seen": 472779776 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004326880641925777, + "loss": 3.0734, + "theoretical_loss": 3.938588171306379, + "tokens_seen": 472845312 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043267803410230695, + "loss": 2.9987, + "theoretical_loss": 3.938528812697873, + "tokens_seen": 472910848 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004326680040120361, + "loss": 3.1647, + "theoretical_loss": 3.938469464617608, + "tokens_seen": 472976384 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004326579739217653, + "loss": 3.0175, + "theoretical_loss": 3.938410127062258, + "tokens_seen": 473041920 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004326479438314945, + "loss": 3.0582, + "theoretical_loss": 3.9383508000284997, + "tokens_seen": 473107456 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004326379137412237, + "loss": 2.9429, + "theoretical_loss": 3.938291483513009, + "tokens_seen": 473172992 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043262788365095286, + "loss": 2.981, + "theoretical_loss": 3.938232177512466, + "tokens_seen": 473238528 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043261785356068204, + "loss": 3.0132, + "theoretical_loss": 3.93817288202355, + "tokens_seen": 473304064 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004326078234704112, + "loss": 3.1105, + "theoretical_loss": 3.9381135970429426, + "tokens_seen": 473369600 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043259779338014045, + "loss": 3.1185, + "theoretical_loss": 3.938054322567328, + "tokens_seen": 473435136 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 777810, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1461081504821777, + "objective/train/theoretical_loss": 3.9380098736025237, + "objective/train/tokens_used": 493944288, + "theoretical_loss": 3.9380098736025237, + "tokens_seen": 473484288 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004325877632898696, + "loss": 3.042, + "theoretical_loss": 3.9379950585933905, + "tokens_seen": 473500672 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004325777331995988, + "loss": 3.006, + "theoretical_loss": 3.9379358051178164, + "tokens_seen": 473566208 + }, + { + "epoch": 1.04, + "learning_rate": 0.000432567703109328, + "loss": 3.0798, + "theoretical_loss": 3.937876562137294, + "tokens_seen": 473631744 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004325576730190572, + "loss": 3.0596, + "theoretical_loss": 3.9378173296485115, + "tokens_seen": 473697280 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043254764292878636, + "loss": 3.1396, + "theoretical_loss": 3.9377581076481603, + "tokens_seen": 473762816 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043253761283851554, + "loss": 3.1611, + "theoretical_loss": 3.9376988961329333, + "tokens_seen": 473828352 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004325275827482447, + "loss": 3.1311, + "theoretical_loss": 3.937639695099523, + "tokens_seen": 473893888 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043251755265797396, + "loss": 3.1988, + "theoretical_loss": 3.937580504544626, + "tokens_seen": 473959424 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004325075225677031, + "loss": 3.1165, + "theoretical_loss": 3.9375213244649383, + "tokens_seen": 474024960 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004324974924774323, + "loss": 3.1341, + "theoretical_loss": 3.9374621548571582, + "tokens_seen": 474090496 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043248746238716145, + "loss": 3.0819, + "theoretical_loss": 3.937402995717986, + "tokens_seen": 474156032 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004324774322968907, + "loss": 3.0635, + "theoretical_loss": 3.937343847044123, + "tokens_seen": 474221568 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043246740220661986, + "loss": 3.0965, + "theoretical_loss": 3.937284708832271, + "tokens_seen": 474287104 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043245737211634904, + "loss": 3.04, + "theoretical_loss": 3.9372255810791357, + "tokens_seen": 474352640 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004324473420260782, + "loss": 3.2905, + "theoretical_loss": 3.9371664637814217, + "tokens_seen": 474418176 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004324373119358074, + "loss": 3.1888, + "theoretical_loss": 3.9371073569358366, + "tokens_seen": 474483712 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043242728184553664, + "loss": 3.1338, + "theoretical_loss": 3.9370482605390897, + "tokens_seen": 474549248 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004324172517552658, + "loss": 3.0821, + "theoretical_loss": 3.9369891745878904, + "tokens_seen": 474614784 + }, + { + "epoch": 1.04, + "learning_rate": 0.000432407221664995, + "loss": 3.0655, + "theoretical_loss": 3.9369300990789515, + "tokens_seen": 474680320 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004323971915747242, + "loss": 3.1936, + "theoretical_loss": 3.936871034008985, + "tokens_seen": 474745856 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004323871614844534, + "loss": 3.0186, + "theoretical_loss": 3.9368119793747063, + "tokens_seen": 474811392 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043237713139418255, + "loss": 3.0778, + "theoretical_loss": 3.936752935172832, + "tokens_seen": 474876928 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004323671013039118, + "loss": 3.1556, + "theoretical_loss": 3.9366939014000786, + "tokens_seen": 474942464 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004323570712136409, + "loss": 3.1658, + "theoretical_loss": 3.936634878053166, + "tokens_seen": 475008000 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043234704112337014, + "loss": 3.2034, + "theoretical_loss": 3.936575865128815, + "tokens_seen": 475073536 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 780678, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3475329875946045, + "objective/train/theoretical_loss": 3.936531612273386, + "objective/train/tokens_used": 495582688, + "theoretical_loss": 3.936531612273386, + "tokens_seen": 475122688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004323370110330993, + "loss": 3.1818, + "theoretical_loss": 3.9365168626237477, + "tokens_seen": 475139072 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004323269809428285, + "loss": 3.0325, + "theoretical_loss": 3.9364578705346878, + "tokens_seen": 475204608 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004323169508525577, + "loss": 2.9718, + "theoretical_loss": 3.93639888885836, + "tokens_seen": 475270144 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043230692076228687, + "loss": 3.1274, + "theoretical_loss": 3.936339917591491, + "tokens_seen": 475335680 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043229689067201605, + "loss": 3.1798, + "theoretical_loss": 3.9362809567308092, + "tokens_seen": 475401216 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004322868605817453, + "loss": 2.8611, + "theoretical_loss": 3.9362220062730437, + "tokens_seen": 475466752 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004322768304914744, + "loss": 3.0027, + "theoretical_loss": 3.9361630662149256, + "tokens_seen": 475532288 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043226680040120365, + "loss": 2.7894, + "theoretical_loss": 3.9361041365531877, + "tokens_seen": 475597824 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043225677031093283, + "loss": 3.1337, + "theoretical_loss": 3.9360452172845637, + "tokens_seen": 475663360 + }, + { + "epoch": 1.04, + "learning_rate": 0.000432246740220662, + "loss": 2.9619, + "theoretical_loss": 3.9359863084057896, + "tokens_seen": 475728896 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004322367101303912, + "loss": 3.096, + "theoretical_loss": 3.9359274099136012, + "tokens_seen": 475794432 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043222668004012037, + "loss": 3.0578, + "theoretical_loss": 3.9358685218047382, + "tokens_seen": 475859968 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043221664994984955, + "loss": 3.2078, + "theoretical_loss": 3.93580964407594, + "tokens_seen": 475925504 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004322066198595788, + "loss": 3.0603, + "theoretical_loss": 3.935750776723947, + "tokens_seen": 475991040 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004321965897693079, + "loss": 3.0512, + "theoretical_loss": 3.935691919745504, + "tokens_seen": 476056576 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043218655967903715, + "loss": 3.2588, + "theoretical_loss": 3.9356330731373537, + "tokens_seen": 476122112 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004321765295887663, + "loss": 3.175, + "theoretical_loss": 3.935574236896242, + "tokens_seen": 476187648 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004321664994984955, + "loss": 3.0544, + "theoretical_loss": 3.9355154110189163, + "tokens_seen": 476253184 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004321564694082247, + "loss": 3.1464, + "theoretical_loss": 3.935456595502126, + "tokens_seen": 476318720 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004321464393179539, + "loss": 3.1839, + "theoretical_loss": 3.9353977903426207, + "tokens_seen": 476384256 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043213640922768306, + "loss": 3.0053, + "theoretical_loss": 3.935338995537152, + "tokens_seen": 476449792 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043212637913741224, + "loss": 3.1284, + "theoretical_loss": 3.935280211082473, + "tokens_seen": 476515328 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004321163490471414, + "loss": 3.0133, + "theoretical_loss": 3.935221436975338, + "tokens_seen": 476580864 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043210631895687065, + "loss": 3.077, + "theoretical_loss": 3.9351626732125036, + "tokens_seen": 476646400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004320962888665998, + "loss": 2.9485, + "theoretical_loss": 3.935103919790727, + "tokens_seen": 476711936 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 783710, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.4726645946502686, + "objective/train/theoretical_loss": 3.935059861508764, + "objective/train/tokens_used": 497221088, + "theoretical_loss": 3.935059861508764, + "tokens_seen": 476761088 + }, + { + "epoch": 1.04, + "learning_rate": 0.000432086258776329, + "loss": 3.1836, + "theoretical_loss": 3.935045176706767, + "tokens_seen": 476777472 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004320762286860582, + "loss": 3.1737, + "theoretical_loss": 3.934986443957385, + "tokens_seen": 476843008 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004320661985957874, + "loss": 3.0408, + "theoretical_loss": 3.9349277215393412, + "tokens_seen": 476908544 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043205616850551656, + "loss": 3.0718, + "theoretical_loss": 3.9348690094494, + "tokens_seen": 476974080 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043204613841524574, + "loss": 3.0201, + "theoretical_loss": 3.934810307684326, + "tokens_seen": 477039616 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004320361083249749, + "loss": 3.0733, + "theoretical_loss": 3.9347516162408858, + "tokens_seen": 477105152 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043202607823470416, + "loss": 3.1073, + "theoretical_loss": 3.934692935115846, + "tokens_seen": 477170688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004320160481444333, + "loss": 3.1918, + "theoretical_loss": 3.934634264305977, + "tokens_seen": 477236224 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004320060180541625, + "loss": 3.1225, + "theoretical_loss": 3.9345756038080495, + "tokens_seen": 477301760 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043199598796389165, + "loss": 3.1406, + "theoretical_loss": 3.934516953618834, + "tokens_seen": 477367296 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004319859578736209, + "loss": 2.9953, + "theoretical_loss": 3.9344583137351057, + "tokens_seen": 477432832 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043197592778335006, + "loss": 3.0649, + "theoretical_loss": 3.9343996841536386, + "tokens_seen": 477498368 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043196589769307924, + "loss": 3.0539, + "theoretical_loss": 3.934341064871209, + "tokens_seen": 477563904 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004319558676028084, + "loss": 3.0146, + "theoretical_loss": 3.9342824558845955, + "tokens_seen": 477629440 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004319458375125376, + "loss": 3.099, + "theoretical_loss": 3.934223857190578, + "tokens_seen": 477694976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004319358074222668, + "loss": 3.1249, + "theoretical_loss": 3.9341652687859354, + "tokens_seen": 477760512 + }, + { + "epoch": 1.04, + "learning_rate": 0.000431925777331996, + "loss": 3.3014, + "theoretical_loss": 3.934106690667451, + "tokens_seen": 477826048 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043191574724172515, + "loss": 3.0374, + "theoretical_loss": 3.934048122831909, + "tokens_seen": 477891584 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004319057171514544, + "loss": 3.1595, + "theoretical_loss": 3.9339895652760934, + "tokens_seen": 477957120 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043189568706118357, + "loss": 3.0205, + "theoretical_loss": 3.9339310179967915, + "tokens_seen": 478022656 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043188565697091275, + "loss": 3.1232, + "theoretical_loss": 3.9338724809907912, + "tokens_seen": 478088192 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043187562688064193, + "loss": 2.9081, + "theoretical_loss": 3.9338139542548816, + "tokens_seen": 478153728 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004318655967903711, + "loss": 3.0758, + "theoretical_loss": 3.933755437785854, + "tokens_seen": 478219264 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004318555667001003, + "loss": 3.0957, + "theoretical_loss": 3.9336969315805006, + "tokens_seen": 478284800 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004318455366098295, + "loss": 3.1013, + "theoretical_loss": 3.933638435635615, + "tokens_seen": 478350336 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 785886, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.047959566116333, + "objective/train/theoretical_loss": 3.9335945704084554, + "objective/train/tokens_used": 498859488, + "theoretical_loss": 3.9335945704084554, + "tokens_seen": 478399488 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043183550651955865, + "loss": 3.1534, + "theoretical_loss": 3.933579949947993, + "tokens_seen": 478415872 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004318254764292879, + "loss": 3.0707, + "theoretical_loss": 3.9335214745144307, + "tokens_seen": 478481408 + }, + { + "epoch": 1.04, + "learning_rate": 0.000431815446339017, + "loss": 2.9389, + "theoretical_loss": 3.933463009331726, + "tokens_seen": 478546944 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043180541624874625, + "loss": 3.0333, + "theoretical_loss": 3.9334045543966796, + "tokens_seen": 478612480 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043179538615847543, + "loss": 3.0577, + "theoretical_loss": 3.9333461097060916, + "tokens_seen": 478678016 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004317853560682046, + "loss": 3.1546, + "theoretical_loss": 3.9332876752567643, + "tokens_seen": 478743552 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004317753259779338, + "loss": 3.1144, + "theoretical_loss": 3.933229251045501, + "tokens_seen": 478809088 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043176529588766303, + "loss": 3.2305, + "theoretical_loss": 3.9331708370691087, + "tokens_seen": 478874624 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043175526579739215, + "loss": 3.1091, + "theoretical_loss": 3.9331124333243928, + "tokens_seen": 478940160 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004317452357071214, + "loss": 3.1075, + "theoretical_loss": 3.933054039808162, + "tokens_seen": 479005696 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004317352056168505, + "loss": 3.0587, + "theoretical_loss": 3.9329956565172255, + "tokens_seen": 479071232 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043172517552657975, + "loss": 2.9739, + "theoretical_loss": 3.9329372834483944, + "tokens_seen": 479136768 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043171514543630893, + "loss": 3.1694, + "theoretical_loss": 3.9328789205984815, + "tokens_seen": 479202304 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004317051153460381, + "loss": 3.1073, + "theoretical_loss": 3.9328205679643, + "tokens_seen": 479267840 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004316950852557673, + "loss": 3.1007, + "theoretical_loss": 3.9327622255426666, + "tokens_seen": 479333376 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004316850551654965, + "loss": 3.1688, + "theoretical_loss": 3.9327038933303964, + "tokens_seen": 479398912 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004316750250752257, + "loss": 3.1369, + "theoretical_loss": 3.932645571324308, + "tokens_seen": 479464448 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004316649949849549, + "loss": 2.9778, + "theoretical_loss": 3.9325872595212217, + "tokens_seen": 479529984 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004316549648946841, + "loss": 3.1172, + "theoretical_loss": 3.932528957917958, + "tokens_seen": 479595520 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043164493480441326, + "loss": 3.0572, + "theoretical_loss": 3.932470666511339, + "tokens_seen": 479661056 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043163490471414244, + "loss": 3.2025, + "theoretical_loss": 3.932412385298189, + "tokens_seen": 479726592 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004316248746238716, + "loss": 3.1654, + "theoretical_loss": 3.9323541142753333, + "tokens_seen": 479792128 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043161484453360085, + "loss": 3.0233, + "theoretical_loss": 3.932295853439599, + "tokens_seen": 479857664 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043160481444333, + "loss": 3.1733, + "theoretical_loss": 3.9322376027878128, + "tokens_seen": 479923200 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004315947843530592, + "loss": 3.0387, + "theoretical_loss": 3.9321793623168055, + "tokens_seen": 479988736 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 788662, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1168766021728516, + "objective/train/theoretical_loss": 3.93213568864278, + "objective/train/tokens_used": 500497888, + "theoretical_loss": 3.93213568864278, + "tokens_seen": 480037888 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004315847542627884, + "loss": 3.1642, + "theoretical_loss": 3.9321211320234077, + "tokens_seen": 480054272 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004315747241725176, + "loss": 3.1574, + "theoretical_loss": 3.9320629119044526, + "tokens_seen": 480119808 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043156469408224676, + "loss": 3.1736, + "theoretical_loss": 3.9320047019567728, + "tokens_seen": 480185344 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043155466399197594, + "loss": 3.1393, + "theoretical_loss": 3.9319465021772033, + "tokens_seen": 480250880 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004315446339017051, + "loss": 3.143, + "theoretical_loss": 3.931888312562582, + "tokens_seen": 480316416 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043153460381143436, + "loss": 3.1499, + "theoretical_loss": 3.931830133109746, + "tokens_seen": 480381952 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004315245737211635, + "loss": 2.9706, + "theoretical_loss": 3.9317719638155353, + "tokens_seen": 480447488 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004315145436308927, + "loss": 3.1538, + "theoretical_loss": 3.931713804676791, + "tokens_seen": 480513024 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043150451354062185, + "loss": 3.2051, + "theoretical_loss": 3.9316556556903546, + "tokens_seen": 480578560 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004314944834503511, + "loss": 3.0838, + "theoretical_loss": 3.9315975168530706, + "tokens_seen": 480644096 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043148445336008026, + "loss": 3.0589, + "theoretical_loss": 3.9315393881617835, + "tokens_seen": 480709632 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043147442326980944, + "loss": 3.1544, + "theoretical_loss": 3.9314812696133394, + "tokens_seen": 480775168 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004314643931795386, + "loss": 3.1178, + "theoretical_loss": 3.9314231612045876, + "tokens_seen": 480840704 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004314543630892678, + "loss": 3.047, + "theoretical_loss": 3.9313650629323766, + "tokens_seen": 480906240 + }, + { + "epoch": 1.04, + "learning_rate": 0.000431444332998997, + "loss": 3.0582, + "theoretical_loss": 3.931306974793557, + "tokens_seen": 480971776 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004314343029087262, + "loss": 3.0496, + "theoretical_loss": 3.9312488967849815, + "tokens_seen": 481037312 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043142427281845535, + "loss": 3.2129, + "theoretical_loss": 3.931190828903504, + "tokens_seen": 481102848 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004314142427281846, + "loss": 3.078, + "theoretical_loss": 3.931132771145978, + "tokens_seen": 481168384 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043140421263791377, + "loss": 3.0211, + "theoretical_loss": 3.931074723509261, + "tokens_seen": 481233920 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043139418254764295, + "loss": 3.171, + "theoretical_loss": 3.931016685990211, + "tokens_seen": 481299456 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043138415245737213, + "loss": 2.9881, + "theoretical_loss": 3.930958658585687, + "tokens_seen": 481364992 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004313741223671013, + "loss": 3.0824, + "theoretical_loss": 3.9309006412925482, + "tokens_seen": 481430528 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004313640922768305, + "loss": 3.0406, + "theoretical_loss": 3.930842634107658, + "tokens_seen": 481496064 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004313540621865597, + "loss": 3.2304, + "theoretical_loss": 3.9307846370278803, + "tokens_seen": 481561600 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043134403209628885, + "loss": 3.0492, + "theoretical_loss": 3.9307266500500786, + "tokens_seen": 481627136 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 791380, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.039585590362549, + "objective/train/theoretical_loss": 3.9306831664442643, + "objective/train/tokens_used": 502136288, + "theoretical_loss": 3.9306831664442643, + "tokens_seen": 481676288 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004313340020060181, + "loss": 3.1, + "theoretical_loss": 3.9306686731711196, + "tokens_seen": 481692672 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004313239719157472, + "loss": 3.079, + "theoretical_loss": 3.9306107063878715, + "tokens_seen": 481758208 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043131394182547645, + "loss": 3.1404, + "theoretical_loss": 3.9305527496972017, + "tokens_seen": 481823744 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043130391173520563, + "loss": 3.1685, + "theoretical_loss": 3.930494803095982, + "tokens_seen": 481889280 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004312938816449348, + "loss": 2.9856, + "theoretical_loss": 3.9304368665810845, + "tokens_seen": 481954816 + }, + { + "epoch": 1.04, + "learning_rate": 0.000431283851554664, + "loss": 3.0934, + "theoretical_loss": 3.9303789401493807, + "tokens_seen": 482020352 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043127382146439323, + "loss": 3.1469, + "theoretical_loss": 3.9303210237977466, + "tokens_seen": 482085888 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043126379137412236, + "loss": 3.1508, + "theoretical_loss": 3.930263117523057, + "tokens_seen": 482151424 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004312537612838516, + "loss": 3.1907, + "theoretical_loss": 3.9302052213221907, + "tokens_seen": 482216960 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004312437311935807, + "loss": 2.9841, + "theoretical_loss": 3.9301473351920255, + "tokens_seen": 482282496 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043123370110330995, + "loss": 3.1433, + "theoretical_loss": 3.9300894591294413, + "tokens_seen": 482348032 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043122367101303913, + "loss": 3.2604, + "theoretical_loss": 3.9300315931313206, + "tokens_seen": 482413568 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004312136409227683, + "loss": 3.0146, + "theoretical_loss": 3.9299737371945453, + "tokens_seen": 482479104 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004312036108324975, + "loss": 3.0738, + "theoretical_loss": 3.9299158913160004, + "tokens_seen": 482544640 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004311935807422267, + "loss": 3.1308, + "theoretical_loss": 3.9298580554925717, + "tokens_seen": 482610176 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043118355065195586, + "loss": 2.9519, + "theoretical_loss": 3.9298002297211454, + "tokens_seen": 482675712 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004311735205616851, + "loss": 3.0089, + "theoretical_loss": 3.9297424139986106, + "tokens_seen": 482741248 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004311634904714142, + "loss": 3.2172, + "theoretical_loss": 3.9296846083218573, + "tokens_seen": 482806784 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043115346038114346, + "loss": 3.0583, + "theoretical_loss": 3.9296268126877765, + "tokens_seen": 482872320 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004311434302908726, + "loss": 3.152, + "theoretical_loss": 3.9295690270932604, + "tokens_seen": 482937856 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004311334002006018, + "loss": 3.0147, + "theoretical_loss": 3.929511251535204, + "tokens_seen": 483003392 + }, + { + "epoch": 1.04, + "learning_rate": 0.000431123370110331, + "loss": 2.9952, + "theoretical_loss": 3.9294534860105017, + "tokens_seen": 483068928 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004311133400200602, + "loss": 2.9802, + "theoretical_loss": 3.9293957305160507, + "tokens_seen": 483134464 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043110330992978936, + "loss": 3.1134, + "theoretical_loss": 3.9293379850487495, + "tokens_seen": 483200000 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004310932798395186, + "loss": 3.1227, + "theoretical_loss": 3.929280249605497, + "tokens_seen": 483265536 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 794373, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8497860431671143, + "objective/train/theoretical_loss": 3.9292369545994754, + "objective/train/tokens_used": 503774688, + "theoretical_loss": 3.9292369545994754, + "tokens_seen": 483314688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004310832497492477, + "loss": 3.1548, + "theoretical_loss": 3.929222524183195, + "tokens_seen": 483331072 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043107321965897696, + "loss": 3.042, + "theoretical_loss": 3.9291648087787445, + "tokens_seen": 483396608 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004310631895687061, + "loss": 2.9228, + "theoretical_loss": 3.9291071033890494, + "tokens_seen": 483462144 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004310531594784353, + "loss": 2.9634, + "theoretical_loss": 3.929049408011016, + "tokens_seen": 483527680 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004310431293881645, + "loss": 3.0875, + "theoretical_loss": 3.928991722641549, + "tokens_seen": 483593216 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004310330992978937, + "loss": 3.0679, + "theoretical_loss": 3.9289340472775582, + "tokens_seen": 483658752 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043102306920762286, + "loss": 2.9628, + "theoretical_loss": 3.928876381915951, + "tokens_seen": 483724288 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043101303911735205, + "loss": 3.0897, + "theoretical_loss": 3.928818726553639, + "tokens_seen": 483789824 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004310030090270812, + "loss": 3.2173, + "theoretical_loss": 3.9287610811875333, + "tokens_seen": 483855360 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043099297893681046, + "loss": 3.1677, + "theoretical_loss": 3.9287034458145476, + "tokens_seen": 483920896 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004309829488465396, + "loss": 3.1459, + "theoretical_loss": 3.928645820431597, + "tokens_seen": 483986432 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004309729187562688, + "loss": 3.1697, + "theoretical_loss": 3.928588205035596, + "tokens_seen": 484051968 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043096288866599795, + "loss": 2.9369, + "theoretical_loss": 3.928530599623464, + "tokens_seen": 484117504 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004309528585757272, + "loss": 3.14, + "theoretical_loss": 3.9284730041921185, + "tokens_seen": 484183040 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043094282848545637, + "loss": 3.0601, + "theoretical_loss": 3.92841541873848, + "tokens_seen": 484248576 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043093279839518555, + "loss": 2.9191, + "theoretical_loss": 3.92835784325947, + "tokens_seen": 484314112 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004309227683049148, + "loss": 3.0761, + "theoretical_loss": 3.928300277752011, + "tokens_seen": 484379648 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043091273821464397, + "loss": 3.0833, + "theoretical_loss": 3.928242722213028, + "tokens_seen": 484445184 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043090270812437315, + "loss": 3.1764, + "theoretical_loss": 3.9281851766394453, + "tokens_seen": 484510720 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043089267803410233, + "loss": 3.1214, + "theoretical_loss": 3.9281276410281913, + "tokens_seen": 484576256 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004308826479438315, + "loss": 2.8353, + "theoretical_loss": 3.928070115376194, + "tokens_seen": 484641792 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004308726178535607, + "loss": 3.0785, + "theoretical_loss": 3.9280125996803825, + "tokens_seen": 484707328 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004308625877632899, + "loss": 3.1645, + "theoretical_loss": 3.9279550939376877, + "tokens_seen": 484772864 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043085255767301905, + "loss": 3.2155, + "theoretical_loss": 3.9278975981450426, + "tokens_seen": 484838400 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004308425275827483, + "loss": 3.2544, + "theoretical_loss": 3.927840112299381, + "tokens_seen": 484903936 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 795638, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7247087955474854, + "objective/train/theoretical_loss": 3.927797004440999, + "objective/train/tokens_used": 505413088, + "theoretical_loss": 3.927797004440999, + "tokens_seen": 484953088 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004308324974924774, + "loss": 3.0136, + "theoretical_loss": 3.9277826363976382, + "tokens_seen": 484969472 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043082246740220665, + "loss": 3.1647, + "theoretical_loss": 3.9277251704367497, + "tokens_seen": 485035008 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043081243731193583, + "loss": 3.2384, + "theoretical_loss": 3.927667714413654, + "tokens_seen": 485100544 + }, + { + "epoch": 1.04, + "learning_rate": 0.000430802407221665, + "loss": 2.9979, + "theoretical_loss": 3.9276102683252905, + "tokens_seen": 485166080 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004307923771313942, + "loss": 3.0352, + "theoretical_loss": 3.9275528321685993, + "tokens_seen": 485231616 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043078234704112343, + "loss": 2.998, + "theoretical_loss": 3.9274954059405225, + "tokens_seen": 485297152 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043077231695085256, + "loss": 3.0194, + "theoretical_loss": 3.927437989638004, + "tokens_seen": 485362688 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004307622868605818, + "loss": 2.9299, + "theoretical_loss": 3.927380583257987, + "tokens_seen": 485428224 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004307522567703109, + "loss": 3.1945, + "theoretical_loss": 3.9273231867974183, + "tokens_seen": 485493760 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043074222668004015, + "loss": 2.8815, + "theoretical_loss": 3.9272658002532452, + "tokens_seen": 485559296 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043073219658976933, + "loss": 3.1099, + "theoretical_loss": 3.9272084236224165, + "tokens_seen": 485624832 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004307221664994985, + "loss": 3.0577, + "theoretical_loss": 3.9271510569018817, + "tokens_seen": 485690368 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004307121364092277, + "loss": 3.1596, + "theoretical_loss": 3.9270937000885935, + "tokens_seen": 485755904 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004307021063189569, + "loss": 2.9445, + "theoretical_loss": 3.9270363531795027, + "tokens_seen": 485821440 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043069207622868606, + "loss": 3.1571, + "theoretical_loss": 3.9269790161715648, + "tokens_seen": 485886976 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004306820461384153, + "loss": 2.8519, + "theoretical_loss": 3.926921689061735, + "tokens_seen": 485952512 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004306720160481444, + "loss": 3.2401, + "theoretical_loss": 3.9268643718469693, + "tokens_seen": 486018048 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043066198595787366, + "loss": 3.0247, + "theoretical_loss": 3.926807064524226, + "tokens_seen": 486083584 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004306519558676028, + "loss": 3.0239, + "theoretical_loss": 3.926749767090466, + "tokens_seen": 486149120 + }, + { + "epoch": 1.04, + "learning_rate": 0.000430641925777332, + "loss": 3.0947, + "theoretical_loss": 3.9266924795426483, + "tokens_seen": 486214656 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004306318956870612, + "loss": 3.1743, + "theoretical_loss": 3.9266352018777355, + "tokens_seen": 486280192 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004306218655967904, + "loss": 3.0861, + "theoretical_loss": 3.926577934092691, + "tokens_seen": 486345728 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043061183550651956, + "loss": 3.1806, + "theoretical_loss": 3.926520676184481, + "tokens_seen": 486411264 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004306018054162488, + "loss": 2.9468, + "theoretical_loss": 3.92646342815007, + "tokens_seen": 486476800 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004305917753259779, + "loss": 3.0781, + "theoretical_loss": 3.9264061899864258, + "tokens_seen": 486542336 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 798621, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3106675148010254, + "objective/train/theoretical_loss": 3.9263632678395606, + "objective/train/tokens_used": 507051488, + "theoretical_loss": 3.9263632678395606, + "tokens_seen": 486591488 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043058174523570716, + "loss": 3.0708, + "theoretical_loss": 3.9263489616905183, + "tokens_seen": 486607872 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004305717151454363, + "loss": 3.0081, + "theoretical_loss": 3.9262917432593163, + "tokens_seen": 486673408 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004305616850551655, + "loss": 3.2136, + "theoretical_loss": 3.9262345346897924, + "tokens_seen": 486738944 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004305516549648947, + "loss": 3.0546, + "theoretical_loss": 3.9261773359789185, + "tokens_seen": 486804480 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004305416248746239, + "loss": 3.2195, + "theoretical_loss": 3.9261201471236697, + "tokens_seen": 486870016 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043053159478435306, + "loss": 3.2155, + "theoretical_loss": 3.9260629681210215, + "tokens_seen": 486935552 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043052156469408225, + "loss": 3.1978, + "theoretical_loss": 3.9260057989679504, + "tokens_seen": 487001088 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004305115346038114, + "loss": 3.1257, + "theoretical_loss": 3.925948639661434, + "tokens_seen": 487066624 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043050150451354066, + "loss": 2.9821, + "theoretical_loss": 3.925891490198453, + "tokens_seen": 487132160 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004304914744232698, + "loss": 2.8507, + "theoretical_loss": 3.925834350575988, + "tokens_seen": 487197696 + }, + { + "epoch": 1.04, + "learning_rate": 0.000430481444332999, + "loss": 3.0456, + "theoretical_loss": 3.92577722079102, + "tokens_seen": 487263232 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043047141424272815, + "loss": 3.103, + "theoretical_loss": 3.9257201008405342, + "tokens_seen": 487328768 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004304613841524574, + "loss": 3.1053, + "theoretical_loss": 3.9256629907215146, + "tokens_seen": 487394304 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043045135406218657, + "loss": 2.9016, + "theoretical_loss": 3.9256058904309477, + "tokens_seen": 487459840 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043044132397191575, + "loss": 3.1049, + "theoretical_loss": 3.9255487999658207, + "tokens_seen": 487525376 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043043129388164493, + "loss": 3.1802, + "theoretical_loss": 3.9254917193231225, + "tokens_seen": 487590912 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043042126379137417, + "loss": 2.9478, + "theoretical_loss": 3.9254346484998432, + "tokens_seen": 487656448 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004304112337011033, + "loss": 3.0425, + "theoretical_loss": 3.9253775874929753, + "tokens_seen": 487721984 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043040120361083253, + "loss": 2.9459, + "theoretical_loss": 3.92532053629951, + "tokens_seen": 487787520 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043039117352056165, + "loss": 3.1606, + "theoretical_loss": 3.9252634949164427, + "tokens_seen": 487853056 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004303811434302909, + "loss": 3.1162, + "theoretical_loss": 3.9252064633407686, + "tokens_seen": 487918592 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043037111334002007, + "loss": 3.0762, + "theoretical_loss": 3.9251494415694843, + "tokens_seen": 487984128 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043036108324974925, + "loss": 3.111, + "theoretical_loss": 3.9250924295995877, + "tokens_seen": 488049664 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043035105315947843, + "loss": 3.0144, + "theoretical_loss": 3.925035427428079, + "tokens_seen": 488115200 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004303410230692076, + "loss": 3.2038, + "theoretical_loss": 3.9249784350519583, + "tokens_seen": 488180736 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 801255, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.028470754623413, + "objective/train/theoretical_loss": 3.924935697196288, + "objective/train/tokens_used": 508689888, + "theoretical_loss": 3.924935697196288, + "tokens_seen": 488229888 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004303309929789368, + "loss": 3.0127, + "theoretical_loss": 3.9249214524682277, + "tokens_seen": 488246272 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043032096288866603, + "loss": 2.9755, + "theoretical_loss": 3.924864479673891, + "tokens_seen": 488311808 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043031093279839516, + "loss": 3.0948, + "theoretical_loss": 3.924807516665953, + "tokens_seen": 488377344 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004303009027081244, + "loss": 3.0326, + "theoretical_loss": 3.9247505634414193, + "tokens_seen": 488442880 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004302908726178535, + "loss": 3.1051, + "theoretical_loss": 3.9246936199972975, + "tokens_seen": 488508416 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043028084252758276, + "loss": 3.0533, + "theoretical_loss": 3.9246366863305964, + "tokens_seen": 488573952 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043027081243731194, + "loss": 3.3134, + "theoretical_loss": 3.924579762438326, + "tokens_seen": 488639488 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004302607823470411, + "loss": 3.1065, + "theoretical_loss": 3.9245228483174968, + "tokens_seen": 488705024 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004302507522567703, + "loss": 3.1414, + "theoretical_loss": 3.9244659439651226, + "tokens_seen": 488770560 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043024072216649953, + "loss": 3.0519, + "theoretical_loss": 3.924409049378217, + "tokens_seen": 488836096 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043023069207622866, + "loss": 3.0045, + "theoretical_loss": 3.9243521645537944, + "tokens_seen": 488901632 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004302206619859579, + "loss": 2.975, + "theoretical_loss": 3.9242952894888723, + "tokens_seen": 488967168 + }, + { + "epoch": 1.04, + "learning_rate": 0.000430210631895687, + "loss": 3.1066, + "theoretical_loss": 3.9242384241804684, + "tokens_seen": 489032704 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043020060180541626, + "loss": 3.2003, + "theoretical_loss": 3.924181568625602, + "tokens_seen": 489098240 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043019057171514544, + "loss": 3.171, + "theoretical_loss": 3.9241247228212934, + "tokens_seen": 489163776 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004301805416248746, + "loss": 3.0997, + "theoretical_loss": 3.924067886764564, + "tokens_seen": 489229312 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043017051153460386, + "loss": 3.0101, + "theoretical_loss": 3.924011060452438, + "tokens_seen": 489294848 + }, + { + "epoch": 1.04, + "learning_rate": 0.000430160481444333, + "loss": 3.1218, + "theoretical_loss": 3.923954243881939, + "tokens_seen": 489360384 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004301504513540622, + "loss": 3.251, + "theoretical_loss": 3.9238974370500923, + "tokens_seen": 489425920 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004301404212637914, + "loss": 3.0984, + "theoretical_loss": 3.923840639953926, + "tokens_seen": 489491456 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004301303911735206, + "loss": 3.0004, + "theoretical_loss": 3.923783852590468, + "tokens_seen": 489556992 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043012036108324976, + "loss": 3.1007, + "theoretical_loss": 3.923727074956748, + "tokens_seen": 489622528 + }, + { + "epoch": 1.04, + "learning_rate": 0.000430110330992979, + "loss": 3.0945, + "theoretical_loss": 3.9236703070497962, + "tokens_seen": 489688064 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004301003009027081, + "loss": 2.9783, + "theoretical_loss": 3.9236135488666464, + "tokens_seen": 489753600 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043009027081243736, + "loss": 3.1709, + "theoretical_loss": 3.923556800404331, + "tokens_seen": 489819136 + }, + { + "epoch": 1.04, + "objective/train/docs_used": 804229, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2461323738098145, + "objective/train/theoretical_loss": 3.923514245435108, + "objective/train/tokens_used": 510328288, + "theoretical_loss": 3.923514245435108, + "tokens_seen": 489868288 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004300802407221665, + "loss": 3.1634, + "theoretical_loss": 3.923500061659885, + "tokens_seen": 489884672 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004300702106318957, + "loss": 3.0673, + "theoretical_loss": 3.9234433326303444, + "tokens_seen": 489950208 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004300601805416249, + "loss": 2.8394, + "theoretical_loss": 3.9233866133127475, + "tokens_seen": 490015744 + }, + { + "epoch": 1.04, + "learning_rate": 0.0004300501504513541, + "loss": 3.1372, + "theoretical_loss": 3.9233299037041327, + "tokens_seen": 490081280 + }, + { + "epoch": 1.04, + "learning_rate": 0.00043004012036108326, + "loss": 3.1465, + "theoretical_loss": 3.923273203801539, + "tokens_seen": 490146816 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043003009027081245, + "loss": 3.2554, + "theoretical_loss": 3.9232165136020094, + "tokens_seen": 490212352 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043002006018054163, + "loss": 3.1383, + "theoretical_loss": 3.9231598331025856, + "tokens_seen": 490277888 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043001003009027086, + "loss": 2.9262, + "theoretical_loss": 3.923103162300312, + "tokens_seen": 490343424 + }, + { + "epoch": 1.05, + "learning_rate": 0.00043, + "loss": 2.9404, + "theoretical_loss": 3.9230465011922333, + "tokens_seen": 490408960 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004299899699097292, + "loss": 2.8928, + "theoretical_loss": 3.922989849775396, + "tokens_seen": 490474496 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042997993981945835, + "loss": 3.1921, + "theoretical_loss": 3.9229332080468486, + "tokens_seen": 490540032 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004299699097291876, + "loss": 3.1682, + "theoretical_loss": 3.92287657600364, + "tokens_seen": 490605568 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042995987963891677, + "loss": 3.1156, + "theoretical_loss": 3.9228199536428203, + "tokens_seen": 490671104 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042994984954864595, + "loss": 3.0684, + "theoretical_loss": 3.922763340961442, + "tokens_seen": 490736640 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042993981945837513, + "loss": 2.9642, + "theoretical_loss": 3.922706737956557, + "tokens_seen": 490802176 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042992978936810437, + "loss": 3.0399, + "theoretical_loss": 3.92265014462522, + "tokens_seen": 490867712 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004299197592778335, + "loss": 3.057, + "theoretical_loss": 3.922593560964487, + "tokens_seen": 490933248 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042990972918756273, + "loss": 2.8352, + "theoretical_loss": 3.922536986971415, + "tokens_seen": 490998784 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042989969909729185, + "loss": 2.904, + "theoretical_loss": 3.9224804226430607, + "tokens_seen": 491064320 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004298896690070211, + "loss": 3.0227, + "theoretical_loss": 3.922423867976485, + "tokens_seen": 491129856 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042987963891675027, + "loss": 3.0973, + "theoretical_loss": 3.9223673229687486, + "tokens_seen": 491195392 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042986960882647945, + "loss": 3.0095, + "theoretical_loss": 3.9223107876169125, + "tokens_seen": 491260928 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042985957873620863, + "loss": 2.8206, + "theoretical_loss": 3.9222542619180416, + "tokens_seen": 491326464 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004298495486459378, + "loss": 3.1709, + "theoretical_loss": 3.922197745869199, + "tokens_seen": 491392000 + }, + { + "epoch": 1.05, + "learning_rate": 0.000429839518555667, + "loss": 3.1893, + "theoretical_loss": 3.922141239467451, + "tokens_seen": 491457536 + }, + { + "debugging/Self-BLEU-5": 0.6250507482234287, + "debugging/distinct-1-grams": 0.7131323044095581, + "debugging/distinct-2-grams": 0.9276440854000618, + "debugging/entropy-1-grams": 6.323579560455782, + "debugging/entropy-2-grams": 7.544112459587328, + "debugging/length": 577.0769230769231, + "debugging/num_segments": 26, + "epoch": 1.05, + "objective/train/docs_used": 806881, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9723455905914307, + "objective/train/theoretical_loss": 3.922098865995282, + "objective/train/tokens_used": 511966688, + "theoretical_loss": 3.922098865995282, + "tokens_seen": 491506688 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042982948846539623, + "loss": 3.1316, + "theoretical_loss": 3.922084742709865, + "tokens_seen": 491523072 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042981945837512536, + "loss": 3.0543, + "theoretical_loss": 3.922028255593509, + "tokens_seen": 491588608 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004298094282848546, + "loss": 3.0158, + "theoretical_loss": 3.921971778115454, + "tokens_seen": 491654144 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004297993981945837, + "loss": 3.1159, + "theoretical_loss": 3.9219153102727695, + "tokens_seen": 491719680 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042978936810431296, + "loss": 3.101, + "theoretical_loss": 3.9218588520625284, + "tokens_seen": 491785216 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042977933801404214, + "loss": 2.9939, + "theoretical_loss": 3.9218024034818044, + "tokens_seen": 491850752 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004297693079237713, + "loss": 3.1721, + "theoretical_loss": 3.921745964527672, + "tokens_seen": 491916288 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004297592778335005, + "loss": 3.0444, + "theoretical_loss": 3.921689535197207, + "tokens_seen": 491981824 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042974924774322973, + "loss": 3.0457, + "theoretical_loss": 3.9216331154874884, + "tokens_seen": 492047360 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042973921765295886, + "loss": 3.173, + "theoretical_loss": 3.9215767053955934, + "tokens_seen": 492112896 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004297291875626881, + "loss": 3.1564, + "theoretical_loss": 3.9215203049186016, + "tokens_seen": 492178432 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004297191574724172, + "loss": 3.1195, + "theoretical_loss": 3.9214639140535956, + "tokens_seen": 492243968 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042970912738214646, + "loss": 3.2048, + "theoretical_loss": 3.9214075327976574, + "tokens_seen": 492309504 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042969909729187564, + "loss": 3.1387, + "theoretical_loss": 3.92135116114787, + "tokens_seen": 492375040 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004296890672016048, + "loss": 3.2367, + "theoretical_loss": 3.92129479910132, + "tokens_seen": 492440576 + }, + { + "epoch": 1.05, + "learning_rate": 0.000429679037111334, + "loss": 3.0912, + "theoretical_loss": 3.921238446655092, + "tokens_seen": 492506112 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004296690070210632, + "loss": 3.0179, + "theoretical_loss": 3.9211821038062746, + "tokens_seen": 492571648 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042965897693079236, + "loss": 3.1384, + "theoretical_loss": 3.9211257705519564, + "tokens_seen": 492637184 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004296489468405216, + "loss": 3.0886, + "theoretical_loss": 3.9210694468892275, + "tokens_seen": 492702720 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004296389167502507, + "loss": 3.0793, + "theoretical_loss": 3.9210131328151796, + "tokens_seen": 492768256 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042962888665997996, + "loss": 3.0415, + "theoretical_loss": 3.920956828326905, + "tokens_seen": 492833792 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004296188565697091, + "loss": 3.0902, + "theoretical_loss": 3.9209005334214986, + "tokens_seen": 492899328 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004296088264794383, + "loss": 3.22, + "theoretical_loss": 3.9208442480960537, + "tokens_seen": 492964864 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004295987963891675, + "loss": 3.0616, + "theoretical_loss": 3.9207879723476684, + "tokens_seen": 493030400 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004295887662988967, + "loss": 3.0443, + "theoretical_loss": 3.9207317061734397, + "tokens_seen": 493095936 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 809707, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8048243522644043, + "objective/train/theoretical_loss": 3.920689512824064, + "objective/train/tokens_used": 513605088, + "theoretical_loss": 3.920689512824064, + "tokens_seen": 493145088 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042957873620862587, + "loss": 2.9779, + "theoretical_loss": 3.920675449570467, + "tokens_seen": 493161472 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004295687061183551, + "loss": 3.0747, + "theoretical_loss": 3.9206192025358506, + "tokens_seen": 493227008 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042955867602808423, + "loss": 2.9091, + "theoretical_loss": 3.9205629650666918, + "tokens_seen": 493292544 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042954864593781347, + "loss": 3.1497, + "theoretical_loss": 3.9205067371600935, + "tokens_seen": 493358080 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004295386158475426, + "loss": 3.1051, + "theoretical_loss": 3.9204505188131593, + "tokens_seen": 493423616 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042952858575727183, + "loss": 3.0968, + "theoretical_loss": 3.9203943100229957, + "tokens_seen": 493489152 + }, + { + "epoch": 1.05, + "learning_rate": 0.000429518555667001, + "loss": 3.1627, + "theoretical_loss": 3.9203381107867075, + "tokens_seen": 493554688 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004295085255767302, + "loss": 3.2209, + "theoretical_loss": 3.920281921101404, + "tokens_seen": 493620224 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042949849548645937, + "loss": 3.1442, + "theoretical_loss": 3.920225740964194, + "tokens_seen": 493685760 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042948846539618855, + "loss": 3.1387, + "theoretical_loss": 3.9201695703721873, + "tokens_seen": 493751296 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042947843530591773, + "loss": 3.0976, + "theoretical_loss": 3.9201134093224965, + "tokens_seen": 493816832 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042946840521564697, + "loss": 2.9889, + "theoretical_loss": 3.920057257812233, + "tokens_seen": 493882368 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004294583751253761, + "loss": 3.2728, + "theoretical_loss": 3.920001115838512, + "tokens_seen": 493947904 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042944834503510533, + "loss": 3.2682, + "theoretical_loss": 3.9199449833984494, + "tokens_seen": 494013440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004294383149448345, + "loss": 3.1874, + "theoretical_loss": 3.919888860489161, + "tokens_seen": 494078976 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004294282848545637, + "loss": 3.0038, + "theoretical_loss": 3.9198327471077645, + "tokens_seen": 494144512 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042941825476429293, + "loss": 3.166, + "theoretical_loss": 3.9197766432513794, + "tokens_seen": 494210048 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042940822467402206, + "loss": 3.097, + "theoretical_loss": 3.919720548917126, + "tokens_seen": 494275584 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004293981945837513, + "loss": 2.9764, + "theoretical_loss": 3.9196644641021265, + "tokens_seen": 494341120 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042938816449348047, + "loss": 3.0478, + "theoretical_loss": 3.919608388803503, + "tokens_seen": 494406656 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042937813440320965, + "loss": 3.0709, + "theoretical_loss": 3.9195523230183804, + "tokens_seen": 494472192 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042936810431293883, + "loss": 3.0596, + "theoretical_loss": 3.919496266743883, + "tokens_seen": 494537728 + }, + { + "epoch": 1.05, + "learning_rate": 0.000429358074222668, + "loss": 3.0263, + "theoretical_loss": 3.9194402199771385, + "tokens_seen": 494603264 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004293480441323972, + "loss": 2.8825, + "theoretical_loss": 3.919384182715275, + "tokens_seen": 494668800 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042933801404212643, + "loss": 3.0925, + "theoretical_loss": 3.9193281549554206, + "tokens_seen": 494734336 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 811217, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1370749473571777, + "objective/train/theoretical_loss": 3.9192861403694974, + "objective/train/tokens_used": 515243488, + "theoretical_loss": 3.9192861403694974, + "tokens_seen": 494783488 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042932798395185556, + "loss": 2.8671, + "theoretical_loss": 3.9192721366947065, + "tokens_seen": 494799872 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004293179538615848, + "loss": 3.0611, + "theoretical_loss": 3.9192161279302637, + "tokens_seen": 494865408 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004293079237713139, + "loss": 3.0009, + "theoretical_loss": 3.9191601286592253, + "tokens_seen": 494930944 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042929789368104316, + "loss": 3.148, + "theoretical_loss": 3.919104138878726, + "tokens_seen": 494996480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042928786359077234, + "loss": 3.0776, + "theoretical_loss": 3.919048158585901, + "tokens_seen": 495062016 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004292778335005015, + "loss": 3.1705, + "theoretical_loss": 3.918992187777886, + "tokens_seen": 495127552 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004292678034102307, + "loss": 3.2219, + "theoretical_loss": 3.9189362264518204, + "tokens_seen": 495193088 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042925777331995993, + "loss": 2.8221, + "theoretical_loss": 3.9188802746048417, + "tokens_seen": 495258624 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042924774322968906, + "loss": 3.1986, + "theoretical_loss": 3.9188243322340917, + "tokens_seen": 495324160 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004292377131394183, + "loss": 2.9918, + "theoretical_loss": 3.9187683993367113, + "tokens_seen": 495389696 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004292276830491474, + "loss": 3.0655, + "theoretical_loss": 3.9187124759098433, + "tokens_seen": 495455232 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042921765295887666, + "loss": 3.0604, + "theoretical_loss": 3.9186565619506313, + "tokens_seen": 495520768 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042920762286860584, + "loss": 3.1441, + "theoretical_loss": 3.9186006574562215, + "tokens_seen": 495586304 + }, + { + "epoch": 1.05, + "learning_rate": 0.000429197592778335, + "loss": 3.0255, + "theoretical_loss": 3.9185447624237604, + "tokens_seen": 495651840 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004291875626880642, + "loss": 3.2025, + "theoretical_loss": 3.9184888768503954, + "tokens_seen": 495717376 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004291775325977934, + "loss": 3.0633, + "theoretical_loss": 3.918433000733275, + "tokens_seen": 495782912 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042916750250752256, + "loss": 3.0655, + "theoretical_loss": 3.918377134069551, + "tokens_seen": 495848448 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004291574724172518, + "loss": 3.0472, + "theoretical_loss": 3.9183212768563735, + "tokens_seen": 495913984 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004291474423269809, + "loss": 3.127, + "theoretical_loss": 3.9182654290908956, + "tokens_seen": 495979520 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042913741223671016, + "loss": 3.1227, + "theoretical_loss": 3.9182095907702714, + "tokens_seen": 496045056 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004291273821464393, + "loss": 3.1443, + "theoretical_loss": 3.918153761891656, + "tokens_seen": 496110592 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004291173520561685, + "loss": 2.931, + "theoretical_loss": 3.9180979424522064, + "tokens_seen": 496176128 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004291073219658977, + "loss": 3.1792, + "theoretical_loss": 3.9180421324490795, + "tokens_seen": 496241664 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004290972918756269, + "loss": 3.114, + "theoretical_loss": 3.917986331879434, + "tokens_seen": 496307200 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042908726178535607, + "loss": 3.1256, + "theoretical_loss": 3.9179305407404312, + "tokens_seen": 496372736 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 815081, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.147005558013916, + "objective/train/theoretical_loss": 3.91788870357333, + "objective/train/tokens_used": 516881888, + "theoretical_loss": 3.91788870357333, + "tokens_seen": 496421888 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004290772316950853, + "loss": 3.2891, + "theoretical_loss": 3.9178747590292313, + "tokens_seen": 496438272 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042906720160481443, + "loss": 3.1328, + "theoretical_loss": 3.917818986742997, + "tokens_seen": 496503808 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042905717151454367, + "loss": 2.9878, + "theoretical_loss": 3.917763223878893, + "tokens_seen": 496569344 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004290471414242728, + "loss": 2.9743, + "theoretical_loss": 3.9177074704340833, + "tokens_seen": 496634880 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042903711133400203, + "loss": 3.2028, + "theoretical_loss": 3.917651726405735, + "tokens_seen": 496700416 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004290270812437312, + "loss": 3.1811, + "theoretical_loss": 3.917595991791015, + "tokens_seen": 496765952 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004290170511534604, + "loss": 3.1528, + "theoretical_loss": 3.9175402665870926, + "tokens_seen": 496831488 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042900702106318957, + "loss": 3.1129, + "theoretical_loss": 3.9174845507911367, + "tokens_seen": 496897024 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042899699097291875, + "loss": 3.1476, + "theoretical_loss": 3.9174288444003196, + "tokens_seen": 496962560 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042898696088264793, + "loss": 3.0995, + "theoretical_loss": 3.917373147411813, + "tokens_seen": 497028096 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042897693079237717, + "loss": 3.0915, + "theoretical_loss": 3.917317459822791, + "tokens_seen": 497093632 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004289669007021063, + "loss": 3.1035, + "theoretical_loss": 3.917261781630428, + "tokens_seen": 497159168 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042895687061183553, + "loss": 3.0279, + "theoretical_loss": 3.9172061128319, + "tokens_seen": 497224704 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004289468405215647, + "loss": 3.0463, + "theoretical_loss": 3.9171504534243846, + "tokens_seen": 497290240 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004289368104312939, + "loss": 2.9696, + "theoretical_loss": 3.9170948034050603, + "tokens_seen": 497355776 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004289267803410231, + "loss": 3.0356, + "theoretical_loss": 3.9170391627711068, + "tokens_seen": 497421312 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042891675025075226, + "loss": 3.2209, + "theoretical_loss": 3.916983531519705, + "tokens_seen": 497486848 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042890672016048144, + "loss": 3.1402, + "theoretical_loss": 3.916927909648037, + "tokens_seen": 497552384 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042889669007021067, + "loss": 3.1548, + "theoretical_loss": 3.9168722971532857, + "tokens_seen": 497617920 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004288866599799398, + "loss": 3.0856, + "theoretical_loss": 3.9168166940326365, + "tokens_seen": 497683456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042887662988966903, + "loss": 3.0453, + "theoretical_loss": 3.916761100283275, + "tokens_seen": 497748992 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042886659979939816, + "loss": 3.1351, + "theoretical_loss": 3.916705515902388, + "tokens_seen": 497814528 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004288565697091274, + "loss": 3.1323, + "theoretical_loss": 3.9166499408871633, + "tokens_seen": 497880064 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004288465396188566, + "loss": 3.1688, + "theoretical_loss": 3.9165943752347916, + "tokens_seen": 497945600 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042883650952858576, + "loss": 3.1067, + "theoretical_loss": 3.9165388189424624, + "tokens_seen": 498011136 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 816504, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.196155071258545, + "objective/train/theoretical_loss": 3.9164971578640544, + "objective/train/tokens_used": 518520288, + "theoretical_loss": 3.9164971578640544, + "tokens_seen": 498060288 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042882647943831494, + "loss": 3.1969, + "theoretical_loss": 3.916483272007368, + "tokens_seen": 498076672 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004288164493480441, + "loss": 3.2264, + "theoretical_loss": 3.916427734426702, + "tokens_seen": 498142208 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004288064192577733, + "loss": 3.02, + "theoretical_loss": 3.916372206197657, + "tokens_seen": 498207744 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042879638916750254, + "loss": 3.0739, + "theoretical_loss": 3.9163166873174307, + "tokens_seen": 498273280 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042878635907723166, + "loss": 3.1701, + "theoretical_loss": 3.916261177783219, + "tokens_seen": 498338816 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004287763289869609, + "loss": 2.9895, + "theoretical_loss": 3.9162056775922185, + "tokens_seen": 498404352 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004287662988966901, + "loss": 3.0661, + "theoretical_loss": 3.91615018674163, + "tokens_seen": 498469888 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042875626880641926, + "loss": 3.0896, + "theoretical_loss": 3.916094705228653, + "tokens_seen": 498535424 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042874623871614844, + "loss": 3.0645, + "theoretical_loss": 3.9160392330504896, + "tokens_seen": 498600960 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004287362086258776, + "loss": 3.1685, + "theoretical_loss": 3.9159837702043427, + "tokens_seen": 498666496 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004287261785356068, + "loss": 3.0131, + "theoretical_loss": 3.915928316687415, + "tokens_seen": 498732032 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042871614844533604, + "loss": 3.0925, + "theoretical_loss": 3.9158728724969127, + "tokens_seen": 498797568 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042870611835506517, + "loss": 3.2817, + "theoretical_loss": 3.915817437630042, + "tokens_seen": 498863104 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004286960882647944, + "loss": 3.1747, + "theoretical_loss": 3.915762012084011, + "tokens_seen": 498928640 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042868605817452353, + "loss": 3.1052, + "theoretical_loss": 3.9157065958560273, + "tokens_seen": 498994176 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042867602808425276, + "loss": 3.2073, + "theoretical_loss": 3.9156511889433014, + "tokens_seen": 499059712 + }, + { + "epoch": 1.05, + "learning_rate": 0.000428665997993982, + "loss": 3.1055, + "theoretical_loss": 3.915595791343045, + "tokens_seen": 499125248 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004286559679037111, + "loss": 2.9425, + "theoretical_loss": 3.9155404030524696, + "tokens_seen": 499190784 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042864593781344036, + "loss": 3.2047, + "theoretical_loss": 3.91548502406879, + "tokens_seen": 499256320 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004286359077231695, + "loss": 3.0516, + "theoretical_loss": 3.9154296543892197, + "tokens_seen": 499321856 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004286258776328987, + "loss": 3.0011, + "theoretical_loss": 3.9153742940109755, + "tokens_seen": 499387392 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004286158475426279, + "loss": 3.1932, + "theoretical_loss": 3.9153189429312736, + "tokens_seen": 499452928 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004286058174523571, + "loss": 3.1667, + "theoretical_loss": 3.915263601147333, + "tokens_seen": 499518464 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042859578736208627, + "loss": 3.1432, + "theoretical_loss": 3.9152082686563743, + "tokens_seen": 499584000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004285857572718155, + "loss": 3.2297, + "theoretical_loss": 3.9151529454556164, + "tokens_seen": 499649536 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 819336, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2068521976470947, + "objective/train/theoretical_loss": 3.915111459150072, + "objective/train/tokens_used": 520158688, + "theoretical_loss": 3.915111459150072, + "tokens_seen": 499698688 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042857572718154463, + "loss": 3.1164, + "theoretical_loss": 3.9150976315422827, + "tokens_seen": 499715072 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042856569709127387, + "loss": 3.1164, + "theoretical_loss": 3.9150423269135954, + "tokens_seen": 499780608 + }, + { + "epoch": 1.05, + "learning_rate": 0.000428555667001003, + "loss": 3.2037, + "theoretical_loss": 3.9149870315667794, + "tokens_seen": 499846144 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042854563691073223, + "loss": 3.2664, + "theoretical_loss": 3.91493174549906, + "tokens_seen": 499911680 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004285356068204614, + "loss": 3.1479, + "theoretical_loss": 3.9148764687076643, + "tokens_seen": 499977216 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004285255767301906, + "loss": 3.0049, + "theoretical_loss": 3.9148212011898194, + "tokens_seen": 500042752 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042851554663991977, + "loss": 3.1417, + "theoretical_loss": 3.914765942942756, + "tokens_seen": 500108288 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042850551654964895, + "loss": 3.0638, + "theoretical_loss": 3.914710693963702, + "tokens_seen": 500173824 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042849548645937813, + "loss": 3.0929, + "theoretical_loss": 3.9146554542498913, + "tokens_seen": 500239360 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042848545636910737, + "loss": 3.2898, + "theoretical_loss": 3.9146002237985553, + "tokens_seen": 500304896 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004284754262788365, + "loss": 3.1476, + "theoretical_loss": 3.9145450026069284, + "tokens_seen": 500370432 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042846539618856573, + "loss": 3.1441, + "theoretical_loss": 3.9144897906722456, + "tokens_seen": 500435968 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004284553660982949, + "loss": 2.9735, + "theoretical_loss": 3.9144345879917424, + "tokens_seen": 500501504 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004284453360080241, + "loss": 3.1873, + "theoretical_loss": 3.9143793945626575, + "tokens_seen": 500567040 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004284353059177533, + "loss": 3.1923, + "theoretical_loss": 3.9143242103822287, + "tokens_seen": 500632576 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042842527582748246, + "loss": 3.0812, + "theoretical_loss": 3.9142690354476954, + "tokens_seen": 500698112 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042841524573721164, + "loss": 3.1191, + "theoretical_loss": 3.9142138697563, + "tokens_seen": 500763648 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042840521564694087, + "loss": 3.1231, + "theoretical_loss": 3.9141587133052838, + "tokens_seen": 500829184 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042839518555667, + "loss": 2.9649, + "theoretical_loss": 3.9141035660918897, + "tokens_seen": 500894720 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042838515546639923, + "loss": 3.1742, + "theoretical_loss": 3.9140484281133627, + "tokens_seen": 500960256 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042837512537612836, + "loss": 3.1633, + "theoretical_loss": 3.9139932993669495, + "tokens_seen": 501025792 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004283650952858576, + "loss": 3.1627, + "theoretical_loss": 3.913938179849896, + "tokens_seen": 501091328 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004283550651955868, + "loss": 3.1532, + "theoretical_loss": 3.91388306955945, + "tokens_seen": 501156864 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042834503510531596, + "loss": 3.022, + "theoretical_loss": 3.9138279684928614, + "tokens_seen": 501222400 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042833500501504514, + "loss": 3.1253, + "theoretical_loss": 3.9137728766473803, + "tokens_seen": 501287936 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 822418, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1141302585601807, + "objective/train/theoretical_loss": 3.9137315638129686, + "objective/train/tokens_used": 521797088, + "theoretical_loss": 3.9137315638129686, + "tokens_seen": 501337088 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004283249749247743, + "loss": 3.0617, + "theoretical_loss": 3.9137177940202594, + "tokens_seen": 501353472 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004283149448345035, + "loss": 3.0852, + "theoretical_loss": 3.91366272060875, + "tokens_seen": 501419008 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042830491474423274, + "loss": 3.1981, + "theoretical_loss": 3.9136076564101066, + "tokens_seen": 501484544 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042829488465396186, + "loss": 3.1411, + "theoretical_loss": 3.9135526014215847, + "tokens_seen": 501550080 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004282848545636911, + "loss": 3.2136, + "theoretical_loss": 3.9134975556404403, + "tokens_seen": 501615616 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004282748244734203, + "loss": 3.2107, + "theoretical_loss": 3.9134425190639313, + "tokens_seen": 501681152 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042826479438314946, + "loss": 3.0602, + "theoretical_loss": 3.913387491689316, + "tokens_seen": 501746688 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042825476429287864, + "loss": 3.1874, + "theoretical_loss": 3.9133324735138544, + "tokens_seen": 501812224 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004282447342026078, + "loss": 3.1376, + "theoretical_loss": 3.913277464534808, + "tokens_seen": 501877760 + }, + { + "epoch": 1.05, + "learning_rate": 0.000428234704112337, + "loss": 3.2029, + "theoretical_loss": 3.9132224647494382, + "tokens_seen": 501943296 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042822467402206624, + "loss": 3.0826, + "theoretical_loss": 3.9131674741550087, + "tokens_seen": 502008832 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042821464393179537, + "loss": 3.0498, + "theoretical_loss": 3.9131124927487844, + "tokens_seen": 502074368 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004282046138415246, + "loss": 3.1069, + "theoretical_loss": 3.9130575205280307, + "tokens_seen": 502139904 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042819458375125373, + "loss": 3.0809, + "theoretical_loss": 3.9130025574900147, + "tokens_seen": 502205440 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042818455366098296, + "loss": 3.0777, + "theoretical_loss": 3.912947603632004, + "tokens_seen": 502270976 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042817452357071215, + "loss": 3.0323, + "theoretical_loss": 3.912892658951268, + "tokens_seen": 502336512 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004281644934804413, + "loss": 2.9443, + "theoretical_loss": 3.912837723445078, + "tokens_seen": 502402048 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004281544633901705, + "loss": 3.1947, + "theoretical_loss": 3.9127827971107045, + "tokens_seen": 502467584 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004281444332998997, + "loss": 3.1146, + "theoretical_loss": 3.912727879945421, + "tokens_seen": 502533120 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042813440320962887, + "loss": 3.1167, + "theoretical_loss": 3.9126729719465003, + "tokens_seen": 502598656 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004281243731193581, + "loss": 3.3412, + "theoretical_loss": 3.9126180731112186, + "tokens_seen": 502664192 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042811434302908723, + "loss": 3.112, + "theoretical_loss": 3.912563183436852, + "tokens_seen": 502729728 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042810431293881647, + "loss": 2.9406, + "theoretical_loss": 3.9125083029206777, + "tokens_seen": 502795264 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042809428284854565, + "loss": 3.0329, + "theoretical_loss": 3.9124534315599737, + "tokens_seen": 502860800 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042808425275827483, + "loss": 3.1227, + "theoretical_loss": 3.912398569352021, + "tokens_seen": 502926336 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 825354, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3944315910339355, + "objective/train/theoretical_loss": 3.9123574287009126, + "objective/train/tokens_used": 523435488, + "theoretical_loss": 3.9123574287009126, + "tokens_seen": 502975488 + }, + { + "epoch": 1.05, + "learning_rate": 0.000428074222668004, + "loss": 3.1932, + "theoretical_loss": 3.912343716294099, + "tokens_seen": 502991872 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004280641925777332, + "loss": 3.1344, + "theoretical_loss": 3.912288872383491, + "tokens_seen": 503057408 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004280541624874624, + "loss": 3.035, + "theoretical_loss": 3.9122340376174796, + "tokens_seen": 503122944 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004280441323971916, + "loss": 3.0703, + "theoretical_loss": 3.912179211993349, + "tokens_seen": 503188480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042803410230692074, + "loss": 3.1349, + "theoretical_loss": 3.9121243955083855, + "tokens_seen": 503254016 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042802407221664997, + "loss": 3.1643, + "theoretical_loss": 3.9120695881598753, + "tokens_seen": 503319552 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004280140421263791, + "loss": 3.1643, + "theoretical_loss": 3.912014789945106, + "tokens_seen": 503385088 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042800401203610833, + "loss": 3.0295, + "theoretical_loss": 3.9119600008613675, + "tokens_seen": 503450624 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004279939819458375, + "loss": 2.9207, + "theoretical_loss": 3.9119052209059486, + "tokens_seen": 503516160 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004279839518555667, + "loss": 3.1165, + "theoretical_loss": 3.911850450076142, + "tokens_seen": 503581696 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004279739217652959, + "loss": 2.9616, + "theoretical_loss": 3.9117956883692395, + "tokens_seen": 503647232 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004279638916750251, + "loss": 3.0551, + "theoretical_loss": 3.911740935782535, + "tokens_seen": 503712768 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042795386158475424, + "loss": 3.0891, + "theoretical_loss": 3.911686192313323, + "tokens_seen": 503778304 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004279438314944835, + "loss": 2.8481, + "theoretical_loss": 3.911631457958899, + "tokens_seen": 503843840 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004279338014042126, + "loss": 3.2356, + "theoretical_loss": 3.911576732716562, + "tokens_seen": 503909376 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042792377131394184, + "loss": 3.06, + "theoretical_loss": 3.911522016583608, + "tokens_seen": 503974912 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042791374122367107, + "loss": 3.2011, + "theoretical_loss": 3.911467309557338, + "tokens_seen": 504040448 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004279037111334002, + "loss": 3.1414, + "theoretical_loss": 3.9114126116350514, + "tokens_seen": 504105984 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042789368104312943, + "loss": 3.2216, + "theoretical_loss": 3.9113579228140507, + "tokens_seen": 504171520 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042788365095285856, + "loss": 3.2078, + "theoretical_loss": 3.9113032430916386, + "tokens_seen": 504237056 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004278736208625878, + "loss": 3.139, + "theoretical_loss": 3.911248572465119, + "tokens_seen": 504302592 + }, + { + "epoch": 1.05, + "learning_rate": 0.000427863590772317, + "loss": 3.0547, + "theoretical_loss": 3.9111939109317966, + "tokens_seen": 504368128 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042785356068204616, + "loss": 2.961, + "theoretical_loss": 3.911139258488979, + "tokens_seen": 504433664 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042784353059177534, + "loss": 3.006, + "theoretical_loss": 3.9110846151339724, + "tokens_seen": 504499200 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004278335005015045, + "loss": 2.9341, + "theoretical_loss": 3.9110299808640856, + "tokens_seen": 504564736 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 827406, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1196017265319824, + "objective/train/theoretical_loss": 3.9109890111221626, + "objective/train/tokens_used": 525073888, + "theoretical_loss": 3.9109890111221626, + "tokens_seen": 504613888 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004278234704112337, + "loss": 3.1171, + "theoretical_loss": 3.910975355676629, + "tokens_seen": 504630272 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042781344032096294, + "loss": 3.266, + "theoretical_loss": 3.910920739568913, + "tokens_seen": 504695808 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042780341023069206, + "loss": 3.1013, + "theoretical_loss": 3.91086613253825, + "tokens_seen": 504761344 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004277933801404213, + "loss": 3.0954, + "theoretical_loss": 3.9108115345819527, + "tokens_seen": 504826880 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004277833500501505, + "loss": 3.1133, + "theoretical_loss": 3.9107569456973357, + "tokens_seen": 504892416 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042777331995987966, + "loss": 3.0817, + "theoretical_loss": 3.9107023658817144, + "tokens_seen": 504957952 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042776328986960884, + "loss": 3.1257, + "theoretical_loss": 3.9106477951324057, + "tokens_seen": 505023488 + }, + { + "epoch": 1.05, + "learning_rate": 0.000427753259779338, + "loss": 3.2428, + "theoretical_loss": 3.910593233446727, + "tokens_seen": 505089024 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004277432296890672, + "loss": 3.1907, + "theoretical_loss": 3.910538680821997, + "tokens_seen": 505154560 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042773319959879644, + "loss": 3.3279, + "theoretical_loss": 3.910484137255536, + "tokens_seen": 505220096 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042772316950852557, + "loss": 3.1409, + "theoretical_loss": 3.910429602744666, + "tokens_seen": 505285632 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004277131394182548, + "loss": 3.166, + "theoretical_loss": 3.910375077286708, + "tokens_seen": 505351168 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042770310932798393, + "loss": 3.1759, + "theoretical_loss": 3.9103205608789864, + "tokens_seen": 505416704 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042769307923771316, + "loss": 3.126, + "theoretical_loss": 3.910266053518825, + "tokens_seen": 505482240 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042768304914744235, + "loss": 2.9463, + "theoretical_loss": 3.91021155520355, + "tokens_seen": 505547776 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042767301905717153, + "loss": 3.0204, + "theoretical_loss": 3.9101570659304885, + "tokens_seen": 505613312 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004276629889669007, + "loss": 3.1151, + "theoretical_loss": 3.9101025856969676, + "tokens_seen": 505678848 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004276529588766299, + "loss": 3.0798, + "theoretical_loss": 3.9100481145003183, + "tokens_seen": 505744384 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042764292878635907, + "loss": 3.0728, + "theoretical_loss": 3.909993652337868, + "tokens_seen": 505809920 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004276328986960883, + "loss": 3.1279, + "theoretical_loss": 3.909939199206951, + "tokens_seen": 505875456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042762286860581743, + "loss": 3.0069, + "theoretical_loss": 3.9098847551048985, + "tokens_seen": 505940992 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042761283851554667, + "loss": 3.2015, + "theoretical_loss": 3.909830320029044, + "tokens_seen": 506006528 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042760280842527585, + "loss": 3.0281, + "theoretical_loss": 3.9097758939767226, + "tokens_seen": 506072064 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042759277833500503, + "loss": 3.0684, + "theoretical_loss": 3.909721476945271, + "tokens_seen": 506137600 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004275827482447342, + "loss": 3.1366, + "theoretical_loss": 3.9096670689320243, + "tokens_seen": 506203136 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 830323, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3021249771118164, + "objective/train/theoretical_loss": 3.9096262688386867, + "objective/train/tokens_used": 526712288, + "theoretical_loss": 3.9096262688386867, + "tokens_seen": 506252288 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004275727181544634, + "loss": 2.9153, + "theoretical_loss": 3.909612669934323, + "tokens_seen": 506268672 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004275626880641926, + "loss": 3.1515, + "theoretical_loss": 3.909558279949505, + "tokens_seen": 506334208 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004275526579739218, + "loss": 3.137, + "theoretical_loss": 3.9095038989749114, + "tokens_seen": 506399744 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042754262788365094, + "loss": 3.1477, + "theoretical_loss": 3.909449527007883, + "tokens_seen": 506465280 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042753259779338017, + "loss": 2.9557, + "theoretical_loss": 3.909395164045763, + "tokens_seen": 506530816 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004275225677031093, + "loss": 3.1121, + "theoretical_loss": 3.909340810085896, + "tokens_seen": 506596352 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042751253761283853, + "loss": 2.966, + "theoretical_loss": 3.909286465125626, + "tokens_seen": 506661888 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004275025075225677, + "loss": 3.2218, + "theoretical_loss": 3.9092321291623, + "tokens_seen": 506727424 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004274924774322969, + "loss": 3.0295, + "theoretical_loss": 3.9091778021932635, + "tokens_seen": 506792960 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004274824473420261, + "loss": 3.1102, + "theoretical_loss": 3.9091234842158666, + "tokens_seen": 506858496 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004274724172517553, + "loss": 3.2479, + "theoretical_loss": 3.909069175227458, + "tokens_seen": 506924032 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042746238716148444, + "loss": 3.2272, + "theoretical_loss": 3.909014875225389, + "tokens_seen": 506989568 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004274523570712137, + "loss": 3.0488, + "theoretical_loss": 3.9089605842070103, + "tokens_seen": 507055104 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004274423269809428, + "loss": 3.0501, + "theoretical_loss": 3.908906302169675, + "tokens_seen": 507120640 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042743229689067204, + "loss": 3.1691, + "theoretical_loss": 3.9088520291107383, + "tokens_seen": 507186176 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004274222668004012, + "loss": 3.1173, + "theoretical_loss": 3.9087977650275536, + "tokens_seen": 507251712 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004274122367101304, + "loss": 3.0996, + "theoretical_loss": 3.9087435099174783, + "tokens_seen": 507317248 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004274022066198596, + "loss": 3.1654, + "theoretical_loss": 3.9086892637778687, + "tokens_seen": 507382784 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042739217652958876, + "loss": 3.1138, + "theoretical_loss": 3.908635026606084, + "tokens_seen": 507448320 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042738214643931794, + "loss": 3.1055, + "theoretical_loss": 3.908580798399484, + "tokens_seen": 507513856 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004273721163490472, + "loss": 3.1488, + "theoretical_loss": 3.9085265791554287, + "tokens_seen": 507579392 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004273620862587763, + "loss": 3.0311, + "theoretical_loss": 3.90847236887128, + "tokens_seen": 507644928 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042735205616850554, + "loss": 3.1637, + "theoretical_loss": 3.9084181675444016, + "tokens_seen": 507710464 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042734202607823467, + "loss": 3.1011, + "theoretical_loss": 3.9083639751721564, + "tokens_seen": 507776000 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004273319959879639, + "loss": 3.1804, + "theoretical_loss": 3.9083097917519103, + "tokens_seen": 507841536 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 832992, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.321255922317505, + "objective/train/theoretical_loss": 3.908269160059891, + "objective/train/tokens_used": 528350688, + "theoretical_loss": 3.908269160059891, + "tokens_seen": 507890688 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004273219658976931, + "loss": 3.0994, + "theoretical_loss": 3.9082556172810294, + "tokens_seen": 507907072 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042731193580742226, + "loss": 3.0989, + "theoretical_loss": 3.9082014517568817, + "tokens_seen": 507972608 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042730190571715145, + "loss": 3.175, + "theoretical_loss": 3.9081472951768346, + "tokens_seen": 508038144 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004272918756268807, + "loss": 3.0891, + "theoretical_loss": 3.908093147538258, + "tokens_seen": 508103680 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004272818455366098, + "loss": 2.9151, + "theoretical_loss": 3.908039008838523, + "tokens_seen": 508169216 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042727181544633904, + "loss": 3.1409, + "theoretical_loss": 3.907984879075001, + "tokens_seen": 508234752 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042726178535606817, + "loss": 3.2706, + "theoretical_loss": 3.9079307582450657, + "tokens_seen": 508300288 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004272517552657974, + "loss": 3.086, + "theoretical_loss": 3.9078766463460903, + "tokens_seen": 508365824 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004272417251755266, + "loss": 2.8847, + "theoretical_loss": 3.9078225433754508, + "tokens_seen": 508431360 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042723169508525577, + "loss": 3.2195, + "theoretical_loss": 3.907768449330523, + "tokens_seen": 508496896 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042722166499498495, + "loss": 3.1595, + "theoretical_loss": 3.907714364208684, + "tokens_seen": 508562432 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042721163490471413, + "loss": 3.2443, + "theoretical_loss": 3.907660288007313, + "tokens_seen": 508627968 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004272016048144433, + "loss": 3.099, + "theoretical_loss": 3.907606220723789, + "tokens_seen": 508693504 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042719157472417255, + "loss": 2.9306, + "theoretical_loss": 3.907552162355493, + "tokens_seen": 508759040 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004271815446339017, + "loss": 2.953, + "theoretical_loss": 3.9074981128998068, + "tokens_seen": 508824576 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004271715145436309, + "loss": 3.1683, + "theoretical_loss": 3.9074440723541133, + "tokens_seen": 508890112 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004271614844533601, + "loss": 3.2375, + "theoretical_loss": 3.9073900407157964, + "tokens_seen": 508955648 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042715145436308927, + "loss": 3.1905, + "theoretical_loss": 3.9073360179822414, + "tokens_seen": 509021184 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004271414242728185, + "loss": 3.0311, + "theoretical_loss": 3.9072820041508347, + "tokens_seen": 509086720 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042713139418254763, + "loss": 3.146, + "theoretical_loss": 3.9072279992189634, + "tokens_seen": 509152256 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042712136409227687, + "loss": 3.1052, + "theoretical_loss": 3.907174003184016, + "tokens_seen": 509217792 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042711133400200605, + "loss": 3.0927, + "theoretical_loss": 3.9071200160433825, + "tokens_seen": 509283328 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042710130391173523, + "loss": 3.2039, + "theoretical_loss": 3.9070660377944524, + "tokens_seen": 509348864 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004270912738214644, + "loss": 2.935, + "theoretical_loss": 3.907012068434619, + "tokens_seen": 509414400 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004270812437311936, + "loss": 3.1877, + "theoretical_loss": 3.906958107961273, + "tokens_seen": 509479936 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 835907, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.172219753265381, + "objective/train/theoretical_loss": 3.9069176434364548, + "objective/train/tokens_used": 529989088, + "theoretical_loss": 3.9069176434364548, + "tokens_seen": 509529088 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004270712136409228, + "loss": 3.1328, + "theoretical_loss": 3.9069041563718105, + "tokens_seen": 509545472 + }, + { + "epoch": 1.05, + "learning_rate": 0.000427061183550652, + "loss": 3.086, + "theoretical_loss": 3.906850213663626, + "tokens_seen": 509611008 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042705115346038114, + "loss": 3.1241, + "theoretical_loss": 3.906796279834115, + "tokens_seen": 509676544 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042704112337011037, + "loss": 3.3367, + "theoretical_loss": 3.906742354880675, + "tokens_seen": 509742080 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004270310932798395, + "loss": 3.1246, + "theoretical_loss": 3.906688438800705, + "tokens_seen": 509807616 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042702106318956873, + "loss": 3.1839, + "theoretical_loss": 3.9066345315916036, + "tokens_seen": 509873152 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004270110330992979, + "loss": 3.0617, + "theoretical_loss": 3.9065806332507718, + "tokens_seen": 509938688 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004270010030090271, + "loss": 2.9839, + "theoretical_loss": 3.906526743775611, + "tokens_seen": 510004224 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004269909729187563, + "loss": 2.9314, + "theoretical_loss": 3.9064728631635237, + "tokens_seen": 510069760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004269809428284855, + "loss": 3.042, + "theoretical_loss": 3.9064189914119143, + "tokens_seen": 510135296 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042697091273821464, + "loss": 2.9985, + "theoretical_loss": 3.9063651285181877, + "tokens_seen": 510200832 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004269608826479439, + "loss": 3.102, + "theoretical_loss": 3.9063112744797497, + "tokens_seen": 510266368 + }, + { + "epoch": 1.05, + "learning_rate": 0.000426950852557673, + "loss": 3.0315, + "theoretical_loss": 3.9062574292940067, + "tokens_seen": 510331904 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042694082246740224, + "loss": 3.0334, + "theoretical_loss": 3.9062035929583683, + "tokens_seen": 510397440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004269307923771314, + "loss": 3.1184, + "theoretical_loss": 3.906149765470243, + "tokens_seen": 510462976 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004269207622868606, + "loss": 3.1364, + "theoretical_loss": 3.906095946827041, + "tokens_seen": 510528512 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004269107321965898, + "loss": 3.0032, + "theoretical_loss": 3.9060421370261738, + "tokens_seen": 510594048 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042690070210631896, + "loss": 3.091, + "theoretical_loss": 3.9059883360650547, + "tokens_seen": 510659584 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042689067201604814, + "loss": 3.1334, + "theoretical_loss": 3.905934543941097, + "tokens_seen": 510725120 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004268806419257774, + "loss": 3.1552, + "theoretical_loss": 3.9058807606517147, + "tokens_seen": 510790656 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004268706118355065, + "loss": 3.1225, + "theoretical_loss": 3.9058269861943247, + "tokens_seen": 510856192 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042686058174523574, + "loss": 3.1925, + "theoretical_loss": 3.905773220566343, + "tokens_seen": 510921728 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042685055165496487, + "loss": 3.182, + "theoretical_loss": 3.905719463765188, + "tokens_seen": 510987264 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004268405215646941, + "loss": 3.0655, + "theoretical_loss": 3.9056657157882793, + "tokens_seen": 511052800 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004268304914744233, + "loss": 3.1791, + "theoretical_loss": 3.905611976633036, + "tokens_seen": 511118336 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 838646, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.035413980484009, + "objective/train/theoretical_loss": 3.9055716780542706, + "objective/train/tokens_used": 531627488, + "theoretical_loss": 3.9055716780542706, + "tokens_seen": 511167488 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042682046138415246, + "loss": 2.9695, + "theoretical_loss": 3.90555824629688, + "tokens_seen": 511183872 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042681043129388165, + "loss": 3.1855, + "theoretical_loss": 3.9055045247772338, + "tokens_seen": 511249408 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004268004012036109, + "loss": 3.0497, + "theoretical_loss": 3.9054508120715203, + "tokens_seen": 511314944 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042679037111334, + "loss": 3.0742, + "theoretical_loss": 3.905397108177165, + "tokens_seen": 511380480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042678034102306924, + "loss": 2.8775, + "theoretical_loss": 3.9053434130915923, + "tokens_seen": 511446016 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042677031093279837, + "loss": 3.1956, + "theoretical_loss": 3.905289726812229, + "tokens_seen": 511511552 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004267602808425276, + "loss": 3.0561, + "theoretical_loss": 3.905236049336504, + "tokens_seen": 511577088 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004267502507522568, + "loss": 2.9972, + "theoretical_loss": 3.905182380661845, + "tokens_seen": 511642624 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042674022066198597, + "loss": 3.1907, + "theoretical_loss": 3.905128720785682, + "tokens_seen": 511708160 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042673019057171515, + "loss": 3.1378, + "theoretical_loss": 3.9050750697054464, + "tokens_seen": 511773696 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042672016048144433, + "loss": 3.0493, + "theoretical_loss": 3.90502142741857, + "tokens_seen": 511839232 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004267101303911735, + "loss": 2.9728, + "theoretical_loss": 3.904967793922486, + "tokens_seen": 511904768 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042670010030090275, + "loss": 3.0141, + "theoretical_loss": 3.9049141692146287, + "tokens_seen": 511970304 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004266900702106319, + "loss": 3.2024, + "theoretical_loss": 3.904860553292434, + "tokens_seen": 512035840 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004266800401203611, + "loss": 2.9925, + "theoretical_loss": 3.9048069461533372, + "tokens_seen": 512101376 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042667001003009024, + "loss": 3.0827, + "theoretical_loss": 3.9047533477947765, + "tokens_seen": 512166912 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042665997993981947, + "loss": 3.0859, + "theoretical_loss": 3.9046997582141896, + "tokens_seen": 512232448 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042664994984954865, + "loss": 3.0451, + "theoretical_loss": 3.904646177409017, + "tokens_seen": 512297984 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042663991975927783, + "loss": 3.2424, + "theoretical_loss": 3.904592605376699, + "tokens_seen": 512363520 + }, + { + "epoch": 1.05, + "learning_rate": 0.000426629889669007, + "loss": 3.0217, + "theoretical_loss": 3.9045390421146777, + "tokens_seen": 512429056 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042661985957873625, + "loss": 3.0596, + "theoretical_loss": 3.904485487620396, + "tokens_seen": 512494592 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004266098294884654, + "loss": 3.1546, + "theoretical_loss": 3.9044319418912963, + "tokens_seen": 512560128 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004265997993981946, + "loss": 3.0652, + "theoretical_loss": 3.9043784049248256, + "tokens_seen": 512625664 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042658976930792374, + "loss": 3.0295, + "theoretical_loss": 3.9043248767184293, + "tokens_seen": 512691200 + }, + { + "epoch": 1.05, + "learning_rate": 0.000426579739217653, + "loss": 2.9329, + "theoretical_loss": 3.9042713572695535, + "tokens_seen": 512756736 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 840097, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0150365829467773, + "objective/train/theoretical_loss": 3.9042312234284853, + "objective/train/tokens_used": 533265888, + "theoretical_loss": 3.9042312234284853, + "tokens_seen": 512805888 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042656970912738216, + "loss": 3.0663, + "theoretical_loss": 3.9042178465756474, + "tokens_seen": 512822272 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042655967903711134, + "loss": 3.1531, + "theoretical_loss": 3.90416434463416, + "tokens_seen": 512887808 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004265496489468405, + "loss": 3.2843, + "theoretical_loss": 3.904110851442542, + "tokens_seen": 512953344 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004265396188565697, + "loss": 3.0433, + "theoretical_loss": 3.904057366998244, + "tokens_seen": 513018880 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004265295887662989, + "loss": 3.0694, + "theoretical_loss": 3.9040038912987196, + "tokens_seen": 513084416 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004265195586760281, + "loss": 3.0188, + "theoretical_loss": 3.9039504243414207, + "tokens_seen": 513149952 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042650952858575724, + "loss": 3.2586, + "theoretical_loss": 3.9038969661238037, + "tokens_seen": 513215488 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004264994984954865, + "loss": 2.9853, + "theoretical_loss": 3.903843516643323, + "tokens_seen": 513281024 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004264894684052156, + "loss": 3.1401, + "theoretical_loss": 3.9037900758974353, + "tokens_seen": 513346560 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042647943831494484, + "loss": 3.1775, + "theoretical_loss": 3.903736643883599, + "tokens_seen": 513412096 + }, + { + "epoch": 1.05, + "learning_rate": 0.000426469408224674, + "loss": 3.2608, + "theoretical_loss": 3.903683220599273, + "tokens_seen": 513477632 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004264593781344032, + "loss": 2.8949, + "theoretical_loss": 3.9036298060419172, + "tokens_seen": 513543168 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004264493480441324, + "loss": 2.9581, + "theoretical_loss": 3.9035764002089923, + "tokens_seen": 513608704 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004264393179538616, + "loss": 3.0817, + "theoretical_loss": 3.90352300309796, + "tokens_seen": 513674240 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042642928786359075, + "loss": 3.0914, + "theoretical_loss": 3.903469614706284, + "tokens_seen": 513739776 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042641925777332, + "loss": 3.0302, + "theoretical_loss": 3.903416235031428, + "tokens_seen": 513805312 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042640922768304916, + "loss": 3.0923, + "theoretical_loss": 3.903362864070858, + "tokens_seen": 513870848 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042639919759277834, + "loss": 2.9682, + "theoretical_loss": 3.9033095018220396, + "tokens_seen": 513936384 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004263891675025076, + "loss": 3.1248, + "theoretical_loss": 3.9032561482824404, + "tokens_seen": 514001920 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004263791374122367, + "loss": 2.9587, + "theoretical_loss": 3.9032028034495285, + "tokens_seen": 514067456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042636910732196594, + "loss": 3.1557, + "theoretical_loss": 3.9031494673207736, + "tokens_seen": 514132992 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042635907723169507, + "loss": 3.0854, + "theoretical_loss": 3.9030961398936466, + "tokens_seen": 514198528 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004263490471414243, + "loss": 3.0355, + "theoretical_loss": 3.9030428211656183, + "tokens_seen": 514264064 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004263390170511535, + "loss": 3.0981, + "theoretical_loss": 3.9029895111341624, + "tokens_seen": 514329600 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042632898696088266, + "loss": 2.9607, + "theoretical_loss": 3.902936209796751, + "tokens_seen": 514395136 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 844055, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.251750946044922, + "objective/train/theoretical_loss": 3.9028962394976414, + "objective/train/tokens_used": 534904288, + "theoretical_loss": 3.9028962394976414, + "tokens_seen": 514444288 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042631895687061185, + "loss": 3.0864, + "theoretical_loss": 3.9028829171508606, + "tokens_seen": 514460672 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004263089267803411, + "loss": 3.0581, + "theoretical_loss": 3.9028296331939654, + "tokens_seen": 514526208 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004262988966900702, + "loss": 3.1028, + "theoretical_loss": 3.9027763579235435, + "tokens_seen": 514591744 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042628886659979944, + "loss": 3.1916, + "theoretical_loss": 3.902723091337073, + "tokens_seen": 514657280 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042627883650952857, + "loss": 3.1155, + "theoretical_loss": 3.9026698334320313, + "tokens_seen": 514722816 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004262688064192578, + "loss": 3.3086, + "theoretical_loss": 3.9026165842058997, + "tokens_seen": 514788352 + }, + { + "epoch": 1.05, + "learning_rate": 0.000426258776328987, + "loss": 2.9133, + "theoretical_loss": 3.9025633436561593, + "tokens_seen": 514853888 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042624874623871617, + "loss": 3.0623, + "theoretical_loss": 3.902510111780291, + "tokens_seen": 514919424 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042623871614844535, + "loss": 3.0425, + "theoretical_loss": 3.90245688857578, + "tokens_seen": 514984960 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042622868605817453, + "loss": 3.0515, + "theoretical_loss": 3.9024036740401087, + "tokens_seen": 515050496 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004262186559679037, + "loss": 3.0754, + "theoretical_loss": 3.902350468170763, + "tokens_seen": 515116032 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042620862587763295, + "loss": 3.1309, + "theoretical_loss": 3.9022972709652293, + "tokens_seen": 515181568 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004261985957873621, + "loss": 3.1176, + "theoretical_loss": 3.902244082420995, + "tokens_seen": 515247104 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004261885656970913, + "loss": 3.0583, + "theoretical_loss": 3.9021909025355486, + "tokens_seen": 515312640 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042617853560682044, + "loss": 3.1478, + "theoretical_loss": 3.9021377313063788, + "tokens_seen": 515378176 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042616850551654967, + "loss": 3.1336, + "theoretical_loss": 3.9020845687309773, + "tokens_seen": 515443712 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042615847542627885, + "loss": 3.2132, + "theoretical_loss": 3.902031414806835, + "tokens_seen": 515509248 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042614844533600803, + "loss": 3.0906, + "theoretical_loss": 3.901978269531445, + "tokens_seen": 515574784 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004261384152457372, + "loss": 3.0108, + "theoretical_loss": 3.9019251329022997, + "tokens_seen": 515640320 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042612838515546645, + "loss": 3.0921, + "theoretical_loss": 3.9018720049168953, + "tokens_seen": 515705856 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004261183550651956, + "loss": 3.1924, + "theoretical_loss": 3.9018188855727267, + "tokens_seen": 515771392 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004261083249749248, + "loss": 2.9991, + "theoretical_loss": 3.9017657748672905, + "tokens_seen": 515836928 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042609829488465394, + "loss": 3.2062, + "theoretical_loss": 3.9017126727980855, + "tokens_seen": 515902464 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004260882647943832, + "loss": 3.1208, + "theoretical_loss": 3.9016595793626094, + "tokens_seen": 515968000 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042607823470411236, + "loss": 3.0737, + "theoretical_loss": 3.901606494558363, + "tokens_seen": 516033536 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 845414, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.233888864517212, + "objective/train/theoretical_loss": 3.901566686617919, + "objective/train/tokens_used": 536542688, + "theoretical_loss": 3.901566686617919, + "tokens_seen": 516082688 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042606820461384154, + "loss": 3.0501, + "theoretical_loss": 3.901553418382847, + "tokens_seen": 516099072 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004260581745235707, + "loss": 3.1005, + "theoretical_loss": 3.9015003508335635, + "tokens_seen": 516164608 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004260481444332999, + "loss": 2.9561, + "theoretical_loss": 3.9014472919080148, + "tokens_seen": 516230144 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004260381143430291, + "loss": 3.1062, + "theoretical_loss": 3.9013942416037057, + "tokens_seen": 516295680 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004260280842527583, + "loss": 3.0457, + "theoretical_loss": 3.9013411999181415, + "tokens_seen": 516361216 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042601805416248744, + "loss": 3.0828, + "theoretical_loss": 3.901288166848828, + "tokens_seen": 516426752 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004260080240722167, + "loss": 2.9771, + "theoretical_loss": 3.901235142393272, + "tokens_seen": 516492288 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004259979939819458, + "loss": 3.0743, + "theoretical_loss": 3.9011821265489823, + "tokens_seen": 516557824 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042598796389167504, + "loss": 3.1108, + "theoretical_loss": 3.9011291193134685, + "tokens_seen": 516623360 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004259779338014042, + "loss": 3.1944, + "theoretical_loss": 3.90107612068424, + "tokens_seen": 516688896 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004259679037111334, + "loss": 3.0822, + "theoretical_loss": 3.901023130658808, + "tokens_seen": 516754432 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004259578736208626, + "loss": 3.15, + "theoretical_loss": 3.9009701492346864, + "tokens_seen": 516819968 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004259478435305918, + "loss": 3.0551, + "theoretical_loss": 3.9009171764093873, + "tokens_seen": 516885504 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042593781344032095, + "loss": 3.0486, + "theoretical_loss": 3.900864212180425, + "tokens_seen": 516951040 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004259277833500502, + "loss": 3.1559, + "theoretical_loss": 3.9008112565453157, + "tokens_seen": 517016576 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004259177532597793, + "loss": 3.1643, + "theoretical_loss": 3.900758309501576, + "tokens_seen": 517082112 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042590772316950854, + "loss": 3.117, + "theoretical_loss": 3.9007053710467225, + "tokens_seen": 517147648 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004258976930792377, + "loss": 2.9682, + "theoretical_loss": 3.9006524411782744, + "tokens_seen": 517213184 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004258876629889669, + "loss": 2.9796, + "theoretical_loss": 3.9005995198937518, + "tokens_seen": 517278720 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004258776328986961, + "loss": 2.9852, + "theoretical_loss": 3.900546607190674, + "tokens_seen": 517344256 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042586760280842527, + "loss": 3.1373, + "theoretical_loss": 3.900493703066564, + "tokens_seen": 517409792 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042585757271815445, + "loss": 3.2932, + "theoretical_loss": 3.900440807518944, + "tokens_seen": 517475328 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004258475426278837, + "loss": 3.0928, + "theoretical_loss": 3.900387920545337, + "tokens_seen": 517540864 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004258375125376128, + "loss": 3.1154, + "theoretical_loss": 3.9003350421432694, + "tokens_seen": 517606400 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042582748244734205, + "loss": 3.1534, + "theoretical_loss": 3.9002821723102654, + "tokens_seen": 517671936 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 848143, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.057004690170288, + "objective/train/theoretical_loss": 3.9002425255574726, + "objective/train/tokens_used": 538181088, + "theoretical_loss": 3.9002425255574726, + "tokens_seen": 517721088 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004258174523570712, + "loss": 2.9977, + "theoretical_loss": 3.9002293110438524, + "tokens_seen": 517737472 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004258074222668004, + "loss": 2.933, + "theoretical_loss": 3.900176458341558, + "tokens_seen": 517803008 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004257973921765296, + "loss": 3.1235, + "theoretical_loss": 3.9001236142009112, + "tokens_seen": 517868544 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042578736208625877, + "loss": 2.981, + "theoretical_loss": 3.900070778619443, + "tokens_seen": 517934080 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042577733199598795, + "loss": 3.0737, + "theoretical_loss": 3.9000179515946822, + "tokens_seen": 517999616 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004257673019057172, + "loss": 3.1079, + "theoretical_loss": 3.8999651331241623, + "tokens_seen": 518065152 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004257572718154463, + "loss": 3.1993, + "theoretical_loss": 3.8999123232054154, + "tokens_seen": 518130688 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042574724172517555, + "loss": 3.2229, + "theoretical_loss": 3.8998595218359764, + "tokens_seen": 518196224 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004257372116349047, + "loss": 3.0372, + "theoretical_loss": 3.8998067290133793, + "tokens_seen": 518261760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004257271815446339, + "loss": 3.0666, + "theoretical_loss": 3.899753944735161, + "tokens_seen": 518327296 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004257171514543631, + "loss": 3.155, + "theoretical_loss": 3.8997011689988574, + "tokens_seen": 518392832 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004257071213640923, + "loss": 3.2022, + "theoretical_loss": 3.8996484018020077, + "tokens_seen": 518458368 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042569709127382145, + "loss": 2.968, + "theoretical_loss": 3.899595643142151, + "tokens_seen": 518523904 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042568706118355064, + "loss": 3.0298, + "theoretical_loss": 3.899542893016826, + "tokens_seen": 518589440 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004256770310932798, + "loss": 3.0269, + "theoretical_loss": 3.899490151423575, + "tokens_seen": 518654976 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042566700100300905, + "loss": 3.25, + "theoretical_loss": 3.8994374183599403, + "tokens_seen": 518720512 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042565697091273823, + "loss": 3.0329, + "theoretical_loss": 3.899384693823465, + "tokens_seen": 518786048 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004256469408224674, + "loss": 3.0437, + "theoretical_loss": 3.899331977811693, + "tokens_seen": 518851584 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042563691073219665, + "loss": 3.1329, + "theoretical_loss": 3.899279270322169, + "tokens_seen": 518917120 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004256268806419258, + "loss": 3.1269, + "theoretical_loss": 3.8992265713524397, + "tokens_seen": 518982656 + }, + { + "epoch": 1.05, + "learning_rate": 0.000425616850551655, + "loss": 3.1163, + "theoretical_loss": 3.899173880900053, + "tokens_seen": 519048192 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042560682046138414, + "loss": 3.0061, + "theoretical_loss": 3.8991211989625563, + "tokens_seen": 519113728 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004255967903711134, + "loss": 3.1144, + "theoretical_loss": 3.899068525537499, + "tokens_seen": 519179264 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042558676028084256, + "loss": 3.1153, + "theoretical_loss": 3.8990158606224314, + "tokens_seen": 519244800 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042557673019057174, + "loss": 3.0426, + "theoretical_loss": 3.8989632042149047, + "tokens_seen": 519310336 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 851078, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1832823753356934, + "objective/train/theoretical_loss": 3.8989237174908613, + "objective/train/tokens_used": 539819488, + "theoretical_loss": 3.8989237174908613, + "tokens_seen": 519359488 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004255667001003009, + "loss": 3.2071, + "theoretical_loss": 3.8989105563124715, + "tokens_seen": 519375872 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004255566700100301, + "loss": 2.9226, + "theoretical_loss": 3.8988579169126854, + "tokens_seen": 519441408 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004255466399197593, + "loss": 2.951, + "theoretical_loss": 3.8988052860130997, + "tokens_seen": 519506944 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004255366098294885, + "loss": 3.0489, + "theoretical_loss": 3.898752663611271, + "tokens_seen": 519572480 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042552657973921764, + "loss": 3.0424, + "theoretical_loss": 3.898700049704755, + "tokens_seen": 519638016 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004255165496489469, + "loss": 3.0821, + "theoretical_loss": 3.8986474442911083, + "tokens_seen": 519703552 + }, + { + "epoch": 1.05, + "learning_rate": 0.000425506519558676, + "loss": 3.1868, + "theoretical_loss": 3.8985948473678906, + "tokens_seen": 519769088 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042549648946840524, + "loss": 3.1046, + "theoretical_loss": 3.8985422589326606, + "tokens_seen": 519834624 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004254864593781344, + "loss": 3.068, + "theoretical_loss": 3.898489678982979, + "tokens_seen": 519900160 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004254764292878636, + "loss": 3.0889, + "theoretical_loss": 3.898437107516407, + "tokens_seen": 519965696 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004254663991975928, + "loss": 3.0583, + "theoretical_loss": 3.898384544530507, + "tokens_seen": 520031232 + }, + { + "epoch": 1.05, + "learning_rate": 0.000425456369107322, + "loss": 2.8908, + "theoretical_loss": 3.8983319900228426, + "tokens_seen": 520096768 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042544633901705115, + "loss": 3.1757, + "theoretical_loss": 3.898279443990978, + "tokens_seen": 520162304 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004254363089267804, + "loss": 3.0417, + "theoretical_loss": 3.8982269064324786, + "tokens_seen": 520227840 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004254262788365095, + "loss": 3.133, + "theoretical_loss": 3.8981743773449113, + "tokens_seen": 520293376 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042541624874623874, + "loss": 3.0436, + "theoretical_loss": 3.8981218567258433, + "tokens_seen": 520358912 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004254062186559679, + "loss": 2.9502, + "theoretical_loss": 3.8980693445728427, + "tokens_seen": 520424448 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004253961885656971, + "loss": 3.1573, + "theoretical_loss": 3.8980168408834794, + "tokens_seen": 520489984 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004253861584754263, + "loss": 3.1544, + "theoretical_loss": 3.897964345655324, + "tokens_seen": 520555520 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042537612838515547, + "loss": 3.0697, + "theoretical_loss": 3.897911858885947, + "tokens_seen": 520621056 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042536609829488465, + "loss": 3.0633, + "theoretical_loss": 3.897859380572922, + "tokens_seen": 520686592 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004253560682046139, + "loss": 3.1526, + "theoretical_loss": 3.897806910713822, + "tokens_seen": 520752128 + }, + { + "epoch": 1.05, + "learning_rate": 0.000425346038114343, + "loss": 3.0427, + "theoretical_loss": 3.897754449306221, + "tokens_seen": 520817664 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042533600802407225, + "loss": 3.157, + "theoretical_loss": 3.897701996347695, + "tokens_seen": 520883200 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004253259779338014, + "loss": 3.13, + "theoretical_loss": 3.897649551835821, + "tokens_seen": 520948736 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 854004, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3040003776550293, + "objective/train/theoretical_loss": 3.897610223993573, + "objective/train/tokens_used": 541457888, + "theoretical_loss": 3.897610223993573, + "tokens_seen": 520997888 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004253159478435306, + "loss": 3.2262, + "theoretical_loss": 3.8975971157681757, + "tokens_seen": 521014272 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004253059177532598, + "loss": 3.1717, + "theoretical_loss": 3.8975446881423372, + "tokens_seen": 521079808 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042529588766298897, + "loss": 3.0773, + "theoretical_loss": 3.8974922689558857, + "tokens_seen": 521145344 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042528585757271815, + "loss": 3.0324, + "theoretical_loss": 3.8974398582064014, + "tokens_seen": 521210880 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004252758274824474, + "loss": 2.8661, + "theoretical_loss": 3.897387455891466, + "tokens_seen": 521276416 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004252657973921765, + "loss": 3.0843, + "theoretical_loss": 3.897335062008662, + "tokens_seen": 521341952 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042525576730190575, + "loss": 3.1082, + "theoretical_loss": 3.8972826765555726, + "tokens_seen": 521407488 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004252457372116349, + "loss": 3.0431, + "theoretical_loss": 3.897230299529782, + "tokens_seen": 521473024 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004252357071213641, + "loss": 3.1305, + "theoretical_loss": 3.897177930928876, + "tokens_seen": 521538560 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004252256770310933, + "loss": 3.21, + "theoretical_loss": 3.897125570750441, + "tokens_seen": 521604096 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004252156469408225, + "loss": 3.2748, + "theoretical_loss": 3.8970732189920643, + "tokens_seen": 521669632 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042520561685055165, + "loss": 2.9692, + "theoretical_loss": 3.897020875651335, + "tokens_seen": 521735168 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042519558676028084, + "loss": 3.2109, + "theoretical_loss": 3.896968540725841, + "tokens_seen": 521800704 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042518555667001, + "loss": 3.0669, + "theoretical_loss": 3.896916214213175, + "tokens_seen": 521866240 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042517552657973925, + "loss": 3.1145, + "theoretical_loss": 3.896863896110926, + "tokens_seen": 521931776 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004251654964894684, + "loss": 3.1483, + "theoretical_loss": 3.896811586416688, + "tokens_seen": 521997312 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004251554663991976, + "loss": 2.7649, + "theoretical_loss": 3.896759285128054, + "tokens_seen": 522062848 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004251454363089268, + "loss": 3.2757, + "theoretical_loss": 3.896706992242618, + "tokens_seen": 522128384 + }, + { + "epoch": 1.05, + "learning_rate": 0.000425135406218656, + "loss": 3.0905, + "theoretical_loss": 3.896654707757976, + "tokens_seen": 522193920 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042512537612838516, + "loss": 3.0813, + "theoretical_loss": 3.896602431671724, + "tokens_seen": 522259456 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042511534603811434, + "loss": 3.1378, + "theoretical_loss": 3.896550163981459, + "tokens_seen": 522324992 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004251053159478435, + "loss": 3.1222, + "theoretical_loss": 3.89649790468478, + "tokens_seen": 522390528 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042509528585757276, + "loss": 3.1761, + "theoretical_loss": 3.8964456537792858, + "tokens_seen": 522456064 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004250852557673019, + "loss": 3.0964, + "theoretical_loss": 3.8963934112625775, + "tokens_seen": 522521600 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004250752256770311, + "loss": 3.0876, + "theoretical_loss": 3.8963411771322556, + "tokens_seen": 522587136 + }, + { + "epoch": 1.05, + "objective/train/docs_used": 856298, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.895815372467041, + "objective/train/theoretical_loss": 3.8963020070366383, + "objective/train/tokens_used": 543096288, + "theoretical_loss": 3.8963020070366383, + "tokens_seen": 522636288 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042506519558676024, + "loss": 3.0401, + "theoretical_loss": 3.896288951385923, + "tokens_seen": 522652672 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004250551654964895, + "loss": 3.129, + "theoretical_loss": 3.8962367340211825, + "tokens_seen": 522718208 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042504513540621866, + "loss": 3.0881, + "theoretical_loss": 3.8961845250356384, + "tokens_seen": 522783744 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042503510531594784, + "loss": 2.8997, + "theoretical_loss": 3.8961323244268957, + "tokens_seen": 522849280 + }, + { + "epoch": 1.05, + "learning_rate": 0.000425025075225677, + "loss": 2.9595, + "theoretical_loss": 3.8960801321925618, + "tokens_seen": 522914816 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004250150451354062, + "loss": 3.003, + "theoretical_loss": 3.896027948330243, + "tokens_seen": 522980352 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004250050150451354, + "loss": 3.0836, + "theoretical_loss": 3.895975772837547, + "tokens_seen": 523045888 + }, + { + "epoch": 1.05, + "learning_rate": 0.0004249949849548646, + "loss": 2.9302, + "theoretical_loss": 3.895923605712084, + "tokens_seen": 523111424 + }, + { + "epoch": 1.05, + "learning_rate": 0.00042498495486459375, + "loss": 3.0437, + "theoretical_loss": 3.895871446951464, + "tokens_seen": 523176960 + }, + { + "epoch": 1.06, + "learning_rate": 0.000424974924774323, + "loss": 3.1986, + "theoretical_loss": 3.895819296553298, + "tokens_seen": 523242496 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042496489468405216, + "loss": 3.015, + "theoretical_loss": 3.8957671545151977, + "tokens_seen": 523308032 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042495486459378135, + "loss": 3.123, + "theoretical_loss": 3.8957150208347766, + "tokens_seen": 523373568 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004249448345035105, + "loss": 3.0315, + "theoretical_loss": 3.8956628955096493, + "tokens_seen": 523439104 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004249348044132397, + "loss": 3.0312, + "theoretical_loss": 3.8956107785374297, + "tokens_seen": 523504640 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042492477432296894, + "loss": 3.2114, + "theoretical_loss": 3.8955586699157347, + "tokens_seen": 523570176 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004249147442326981, + "loss": 3.1276, + "theoretical_loss": 3.8955065696421807, + "tokens_seen": 523635712 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004249047141424273, + "loss": 3.1327, + "theoretical_loss": 3.8954544777143867, + "tokens_seen": 523701248 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004248946840521565, + "loss": 2.9579, + "theoretical_loss": 3.895402394129971, + "tokens_seen": 523766784 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042488465396188567, + "loss": 2.9913, + "theoretical_loss": 3.8953503188865533, + "tokens_seen": 523832320 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042487462387161485, + "loss": 3.0738, + "theoretical_loss": 3.895298251981756, + "tokens_seen": 523897856 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004248645937813441, + "loss": 3.1577, + "theoretical_loss": 3.8952461934131986, + "tokens_seen": 523963392 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004248545636910732, + "loss": 3.1692, + "theoretical_loss": 3.895194143178506, + "tokens_seen": 524028928 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042484453360080245, + "loss": 3.2372, + "theoretical_loss": 3.8951421012753014, + "tokens_seen": 524094464 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004248345035105316, + "loss": 3.0927, + "theoretical_loss": 3.8950900677012097, + "tokens_seen": 524160000 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004248244734202608, + "loss": 3.1642, + "theoretical_loss": 3.895038042453856, + "tokens_seen": 524225536 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 859220, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.023667573928833, + "objective/train/theoretical_loss": 3.894999028981336, + "objective/train/tokens_used": 544734688, + "theoretical_loss": 3.894999028981336, + "tokens_seen": 524274688 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042481444332999, + "loss": 3.0786, + "theoretical_loss": 3.8949860255308684, + "tokens_seen": 524291072 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042480441323971917, + "loss": 3.1049, + "theoretical_loss": 3.8949340169298736, + "tokens_seen": 524356608 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042479438314944835, + "loss": 3.0445, + "theoretical_loss": 3.8948820166485008, + "tokens_seen": 524422144 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004247843530591776, + "loss": 3.0441, + "theoretical_loss": 3.8948300246843797, + "tokens_seen": 524487680 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004247743229689067, + "loss": 3.093, + "theoretical_loss": 3.8947780410351407, + "tokens_seen": 524553216 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042476429287863595, + "loss": 3.1182, + "theoretical_loss": 3.8947260656984155, + "tokens_seen": 524618752 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004247542627883651, + "loss": 2.8689, + "theoretical_loss": 3.894674098671837, + "tokens_seen": 524684288 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004247442326980943, + "loss": 3.0469, + "theoretical_loss": 3.8946221399530385, + "tokens_seen": 524749824 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004247342026078235, + "loss": 3.2277, + "theoretical_loss": 3.894570189539655, + "tokens_seen": 524815360 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004247241725175527, + "loss": 3.0634, + "theoretical_loss": 3.8945182474293216, + "tokens_seen": 524880896 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042471414242728186, + "loss": 3.1776, + "theoretical_loss": 3.894466313619675, + "tokens_seen": 524946432 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042470411233701104, + "loss": 3.036, + "theoretical_loss": 3.8944143881083524, + "tokens_seen": 525011968 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004246940822467402, + "loss": 3.1231, + "theoretical_loss": 3.8943624708929927, + "tokens_seen": 525077504 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042468405215646945, + "loss": 2.9871, + "theoretical_loss": 3.894310561971235, + "tokens_seen": 525143040 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004246740220661986, + "loss": 3.0548, + "theoretical_loss": 3.894258661340719, + "tokens_seen": 525208576 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004246639919759278, + "loss": 3.0081, + "theoretical_loss": 3.894206768999087, + "tokens_seen": 525274112 + }, + { + "epoch": 1.06, + "learning_rate": 0.000424653961885657, + "loss": 3.021, + "theoretical_loss": 3.894154884943981, + "tokens_seen": 525339648 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004246439317953862, + "loss": 3.0781, + "theoretical_loss": 3.8941030091730444, + "tokens_seen": 525405184 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042463390170511536, + "loss": 3.2041, + "theoretical_loss": 3.894051141683921, + "tokens_seen": 525470720 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042462387161484454, + "loss": 2.9206, + "theoretical_loss": 3.8939992824742564, + "tokens_seen": 525536256 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004246138415245737, + "loss": 3.0196, + "theoretical_loss": 3.8939474315416964, + "tokens_seen": 525601792 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042460381143430296, + "loss": 3.046, + "theoretical_loss": 3.893895588883889, + "tokens_seen": 525667328 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004245937813440321, + "loss": 3.0075, + "theoretical_loss": 3.8938437544984805, + "tokens_seen": 525732864 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004245837512537613, + "loss": 2.9131, + "theoretical_loss": 3.8937919283831217, + "tokens_seen": 525798400 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042457372116349044, + "loss": 3.2067, + "theoretical_loss": 3.893740110535462, + "tokens_seen": 525863936 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 862003, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1062052249908447, + "objective/train/theoretical_loss": 3.893701252573981, + "objective/train/tokens_used": 546373088, + "theoretical_loss": 3.893701252573981, + "tokens_seen": 525913088 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004245636910732197, + "loss": 3.0338, + "theoretical_loss": 3.8936883009531513, + "tokens_seen": 525929472 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042455366098294886, + "loss": 3.181, + "theoretical_loss": 3.893636499633843, + "tokens_seen": 525995008 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042454363089267804, + "loss": 3.1015, + "theoretical_loss": 3.8935847065751896, + "tokens_seen": 526060544 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004245336008024072, + "loss": 3.0025, + "theoretical_loss": 3.8935329217748444, + "tokens_seen": 526126080 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004245235707121364, + "loss": 3.2805, + "theoretical_loss": 3.8934811452304627, + "tokens_seen": 526191616 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004245135406218656, + "loss": 3.08, + "theoretical_loss": 3.8934293769397, + "tokens_seen": 526257152 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004245035105315948, + "loss": 3.184, + "theoretical_loss": 3.8933776169002137, + "tokens_seen": 526322688 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042449348044132395, + "loss": 3.231, + "theoretical_loss": 3.8933258651096603, + "tokens_seen": 526388224 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004244834503510532, + "loss": 3.0771, + "theoretical_loss": 3.8932741215656987, + "tokens_seen": 526453760 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042447342026078236, + "loss": 3.0896, + "theoretical_loss": 3.8932223862659896, + "tokens_seen": 526519296 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042446339017051155, + "loss": 2.9603, + "theoretical_loss": 3.893170659208192, + "tokens_seen": 526584832 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004244533600802407, + "loss": 3.0469, + "theoretical_loss": 3.893118940389969, + "tokens_seen": 526650368 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004244433299899699, + "loss": 3.0393, + "theoretical_loss": 3.893067229808981, + "tokens_seen": 526715904 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004244332998996991, + "loss": 3.1272, + "theoretical_loss": 3.893015527462893, + "tokens_seen": 526781440 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004244232698094283, + "loss": 3.0476, + "theoretical_loss": 3.892963833349369, + "tokens_seen": 526846976 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042441323971915745, + "loss": 3.1069, + "theoretical_loss": 3.892912147466074, + "tokens_seen": 526912512 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004244032096288867, + "loss": 3.0267, + "theoretical_loss": 3.892860469810674, + "tokens_seen": 526978048 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004243931795386158, + "loss": 3.1267, + "theoretical_loss": 3.8928088003808377, + "tokens_seen": 527043584 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042438314944834505, + "loss": 3.1184, + "theoretical_loss": 3.8927571391742313, + "tokens_seen": 527109120 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042437311935807423, + "loss": 3.2218, + "theoretical_loss": 3.8927054861885253, + "tokens_seen": 527174656 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004243630892678034, + "loss": 3.1813, + "theoretical_loss": 3.8926538414213887, + "tokens_seen": 527240192 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004243530591775326, + "loss": 2.9503, + "theoretical_loss": 3.8926022048704936, + "tokens_seen": 527305728 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004243430290872618, + "loss": 3.0827, + "theoretical_loss": 3.8925505765335116, + "tokens_seen": 527371264 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042433299899699095, + "loss": 3.0799, + "theoretical_loss": 3.8924989564081147, + "tokens_seen": 527436800 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004243229689067202, + "loss": 3.1281, + "theoretical_loss": 3.892447344491978, + "tokens_seen": 527502336 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 864669, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.070223093032837, + "objective/train/theoretical_loss": 3.8924086409408027, + "objective/train/tokens_used": 548011488, + "theoretical_loss": 3.8924086409408027, + "tokens_seen": 527551488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004243129388164493, + "loss": 3.1335, + "theoretical_loss": 3.892395740782775, + "tokens_seen": 527567872 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042430290872617855, + "loss": 3.0443, + "theoretical_loss": 3.892344145278183, + "tokens_seen": 527633408 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042429287863590773, + "loss": 3.0411, + "theoretical_loss": 3.8922925579758783, + "tokens_seen": 527698944 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004242828485456369, + "loss": 3.155, + "theoretical_loss": 3.892240978873537, + "tokens_seen": 527764480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004242728184553661, + "loss": 3.1072, + "theoretical_loss": 3.8921894079688397, + "tokens_seen": 527830016 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004242627883650953, + "loss": 3.0713, + "theoretical_loss": 3.892137845259465, + "tokens_seen": 527895552 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042425275827482446, + "loss": 2.9263, + "theoretical_loss": 3.8920862907430935, + "tokens_seen": 527961088 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004242427281845537, + "loss": 3.0613, + "theoretical_loss": 3.8920347444174066, + "tokens_seen": 528026624 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004242326980942828, + "loss": 3.0896, + "theoretical_loss": 3.891983206280086, + "tokens_seen": 528092160 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042422266800401206, + "loss": 3.1805, + "theoretical_loss": 3.8919316763288165, + "tokens_seen": 528157696 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004242126379137412, + "loss": 2.9096, + "theoretical_loss": 3.8918801545612816, + "tokens_seen": 528223232 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004242026078234704, + "loss": 3.1811, + "theoretical_loss": 3.8918286409751657, + "tokens_seen": 528288768 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004241925777331996, + "loss": 3.0851, + "theoretical_loss": 3.891777135568156, + "tokens_seen": 528354304 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004241825476429288, + "loss": 3.1462, + "theoretical_loss": 3.891725638337939, + "tokens_seen": 528419840 + }, + { + "epoch": 1.06, + "learning_rate": 0.000424172517552658, + "loss": 3.1474, + "theoretical_loss": 3.8916741492822036, + "tokens_seen": 528485376 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004241624874623872, + "loss": 3.0128, + "theoretical_loss": 3.8916226683986377, + "tokens_seen": 528550912 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004241524573721164, + "loss": 3.1222, + "theoretical_loss": 3.8915711956849317, + "tokens_seen": 528616448 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042414242728184556, + "loss": 3.0659, + "theoretical_loss": 3.891519731138776, + "tokens_seen": 528681984 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042413239719157474, + "loss": 3.1115, + "theoretical_loss": 3.8914682747578633, + "tokens_seen": 528747520 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004241223671013039, + "loss": 3.0555, + "theoretical_loss": 3.891416826539886, + "tokens_seen": 528813056 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042411233701103316, + "loss": 3.0332, + "theoretical_loss": 3.8913653864825366, + "tokens_seen": 528878592 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004241023069207623, + "loss": 3.2897, + "theoretical_loss": 3.891313954583511, + "tokens_seen": 528944128 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004240922768304915, + "loss": 3.0842, + "theoretical_loss": 3.891262530840504, + "tokens_seen": 529009664 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042408224674022065, + "loss": 3.1192, + "theoretical_loss": 3.891211115251213, + "tokens_seen": 529075200 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004240722166499499, + "loss": 3.0528, + "theoretical_loss": 3.8911597078133346, + "tokens_seen": 529140736 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 866091, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.03365421295166, + "objective/train/theoretical_loss": 3.891121157582906, + "objective/train/tokens_used": 549649888, + "theoretical_loss": 3.891121157582906, + "tokens_seen": 529189888 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042406218655967906, + "loss": 3.1028, + "theoretical_loss": 3.891108308524567, + "tokens_seen": 529206272 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042405215646940824, + "loss": 3.1007, + "theoretical_loss": 3.89105691738261, + "tokens_seen": 529271808 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004240421263791374, + "loss": 3.0175, + "theoretical_loss": 3.8910055343851635, + "tokens_seen": 529337344 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004240320962888666, + "loss": 3.1857, + "theoretical_loss": 3.890954159529929, + "tokens_seen": 529402880 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004240220661985958, + "loss": 3.0392, + "theoretical_loss": 3.8909027928146083, + "tokens_seen": 529468416 + }, + { + "epoch": 1.06, + "learning_rate": 0.000424012036108325, + "loss": 3.1238, + "theoretical_loss": 3.8908514342369047, + "tokens_seen": 529533952 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042400200601805415, + "loss": 3.1713, + "theoretical_loss": 3.890800083794521, + "tokens_seen": 529599488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004239919759277834, + "loss": 3.0919, + "theoretical_loss": 3.8907487414851634, + "tokens_seen": 529665024 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042398194583751256, + "loss": 2.9924, + "theoretical_loss": 3.8906974073065372, + "tokens_seen": 529730560 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042397191574724175, + "loss": 3.2125, + "theoretical_loss": 3.890646081256349, + "tokens_seen": 529796096 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004239618856569709, + "loss": 3.1411, + "theoretical_loss": 3.890594763332307, + "tokens_seen": 529861632 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004239518555667001, + "loss": 3.108, + "theoretical_loss": 3.8905434535321186, + "tokens_seen": 529927168 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004239418254764293, + "loss": 3.0747, + "theoretical_loss": 3.890492151853495, + "tokens_seen": 529992704 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004239317953861585, + "loss": 3.1824, + "theoretical_loss": 3.890440858294145, + "tokens_seen": 530058240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042392176529588765, + "loss": 3.1639, + "theoretical_loss": 3.8903895728517814, + "tokens_seen": 530123776 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004239117352056169, + "loss": 3.0125, + "theoretical_loss": 3.8903382955241157, + "tokens_seen": 530189312 + }, + { + "epoch": 1.06, + "learning_rate": 0.000423901705115346, + "loss": 3.0438, + "theoretical_loss": 3.8902870263088607, + "tokens_seen": 530254848 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042389167502507525, + "loss": 3.1333, + "theoretical_loss": 3.890235765203732, + "tokens_seen": 530320384 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042388164493480443, + "loss": 3.1788, + "theoretical_loss": 3.8901845122064436, + "tokens_seen": 530385920 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004238716148445336, + "loss": 3.1721, + "theoretical_loss": 3.8901332673147113, + "tokens_seen": 530451456 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004238615847542628, + "loss": 3.0599, + "theoretical_loss": 3.890082030526253, + "tokens_seen": 530516992 + }, + { + "epoch": 1.06, + "learning_rate": 0.000423851554663992, + "loss": 3.0809, + "theoretical_loss": 3.890030801838786, + "tokens_seen": 530582528 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042384152457372115, + "loss": 3.0511, + "theoretical_loss": 3.889979581250029, + "tokens_seen": 530648064 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004238314944834504, + "loss": 3.0419, + "theoretical_loss": 3.889928368757702, + "tokens_seen": 530713600 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004238214643931795, + "loss": 3.0345, + "theoretical_loss": 3.8898771643595254, + "tokens_seen": 530779136 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 868661, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3108785152435303, + "objective/train/theoretical_loss": 3.8898387663713083, + "objective/train/tokens_used": 551288288, + "theoretical_loss": 3.8898387663713083, + "tokens_seen": 530828288 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042381143430290875, + "loss": 3.0044, + "theoretical_loss": 3.889825968053221, + "tokens_seen": 530844672 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042380140421263793, + "loss": 3.1965, + "theoretical_loss": 3.8897747798365105, + "tokens_seen": 530910208 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004237913741223671, + "loss": 3.0097, + "theoretical_loss": 3.8897235997071187, + "tokens_seen": 530975744 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004237813440320963, + "loss": 3.2095, + "theoretical_loss": 3.8896724276627683, + "tokens_seen": 531041280 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004237713139418255, + "loss": 3.1279, + "theoretical_loss": 3.889621263701186, + "tokens_seen": 531106816 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042376128385155466, + "loss": 3.0161, + "theoretical_loss": 3.8895701078200973, + "tokens_seen": 531172352 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004237512537612839, + "loss": 3.1255, + "theoretical_loss": 3.8895189600172295, + "tokens_seen": 531237888 + }, + { + "epoch": 1.06, + "learning_rate": 0.000423741223671013, + "loss": 3.0144, + "theoretical_loss": 3.88946782029031, + "tokens_seen": 531303424 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042373119358074226, + "loss": 3.0716, + "theoretical_loss": 3.889416688637069, + "tokens_seen": 531368960 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004237211634904714, + "loss": 2.9804, + "theoretical_loss": 3.889365565055235, + "tokens_seen": 531434496 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004237111334002006, + "loss": 3.1777, + "theoretical_loss": 3.889314449542539, + "tokens_seen": 531500032 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004237011033099298, + "loss": 3.099, + "theoretical_loss": 3.8892633420967133, + "tokens_seen": 531565568 + }, + { + "epoch": 1.06, + "learning_rate": 0.000423691073219659, + "loss": 2.9944, + "theoretical_loss": 3.88921224271549, + "tokens_seen": 531631104 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042368104312938816, + "loss": 3.0936, + "theoretical_loss": 3.8891611513966033, + "tokens_seen": 531696640 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004236710130391174, + "loss": 3.0848, + "theoretical_loss": 3.8891100681377866, + "tokens_seen": 531762176 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004236609829488465, + "loss": 3.0272, + "theoretical_loss": 3.889058992936776, + "tokens_seen": 531827712 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042365095285857576, + "loss": 3.1648, + "theoretical_loss": 3.889007925791307, + "tokens_seen": 531893248 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004236409227683049, + "loss": 3.1644, + "theoretical_loss": 3.888956866699118, + "tokens_seen": 531958784 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004236308926780341, + "loss": 2.9636, + "theoretical_loss": 3.888905815657946, + "tokens_seen": 532024320 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004236208625877633, + "loss": 3.0646, + "theoretical_loss": 3.88885477266553, + "tokens_seen": 532089856 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004236108324974925, + "loss": 2.9973, + "theoretical_loss": 3.888803737719611, + "tokens_seen": 532155392 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042360080240722166, + "loss": 3.0342, + "theoretical_loss": 3.888752710817929, + "tokens_seen": 532220928 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042359077231695085, + "loss": 3.1409, + "theoretical_loss": 3.888701691958226, + "tokens_seen": 532286464 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042358074222668, + "loss": 3.0212, + "theoretical_loss": 3.888650681138244, + "tokens_seen": 532352000 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042357071213640926, + "loss": 3.1478, + "theoretical_loss": 3.8885996783557273, + "tokens_seen": 532417536 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 871511, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1881723403930664, + "objective/train/theoretical_loss": 3.88856143154207, + "objective/train/tokens_used": 552926688, + "theoretical_loss": 3.88856143154207, + "tokens_seen": 532466688 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004235606820461384, + "loss": 3.1695, + "theoretical_loss": 3.8885486836084207, + "tokens_seen": 532483072 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004235506519558676, + "loss": 3.113, + "theoretical_loss": 3.8884976968940683, + "tokens_seen": 532548608 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042354062186559675, + "loss": 2.9714, + "theoretical_loss": 3.888446718210418, + "tokens_seen": 532614144 + }, + { + "epoch": 1.06, + "learning_rate": 0.000423530591775326, + "loss": 2.7471, + "theoretical_loss": 3.8883957475552156, + "tokens_seen": 532679680 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042352056168505517, + "loss": 2.7192, + "theoretical_loss": 3.8883447849262094, + "tokens_seen": 532745216 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042351053159478435, + "loss": 3.144, + "theoretical_loss": 3.8882938303211496, + "tokens_seen": 532810752 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042350050150451353, + "loss": 3.067, + "theoretical_loss": 3.888242883737785, + "tokens_seen": 532876288 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042349047141424276, + "loss": 3.0395, + "theoretical_loss": 3.8881919451738662, + "tokens_seen": 532941824 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004234804413239719, + "loss": 2.9579, + "theoretical_loss": 3.8881410146271467, + "tokens_seen": 533007360 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042347041123370113, + "loss": 3.1064, + "theoretical_loss": 3.888090092095377, + "tokens_seen": 533072896 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042346038114343025, + "loss": 3.0399, + "theoretical_loss": 3.888039177576312, + "tokens_seen": 533138432 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004234503510531595, + "loss": 3.1383, + "theoretical_loss": 3.887988271067706, + "tokens_seen": 533203968 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042344032096288867, + "loss": 3.1789, + "theoretical_loss": 3.887937372567314, + "tokens_seen": 533269504 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042343029087261785, + "loss": 2.9265, + "theoretical_loss": 3.8878864820728922, + "tokens_seen": 533335040 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004234202607823471, + "loss": 3.0101, + "theoretical_loss": 3.887835599582198, + "tokens_seen": 533400576 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004234102306920762, + "loss": 2.8948, + "theoretical_loss": 3.88778472509299, + "tokens_seen": 533466112 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042340020060180545, + "loss": 3.103, + "theoretical_loss": 3.887733858603027, + "tokens_seen": 533531648 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042339017051153463, + "loss": 2.9, + "theoretical_loss": 3.8876830001100675, + "tokens_seen": 533597184 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004233801404212638, + "loss": 2.9454, + "theoretical_loss": 3.887632149611874, + "tokens_seen": 533662720 + }, + { + "epoch": 1.06, + "learning_rate": 0.000423370110330993, + "loss": 3.1061, + "theoretical_loss": 3.8875813071062075, + "tokens_seen": 533728256 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004233600802407222, + "loss": 2.9736, + "theoretical_loss": 3.8875304725908304, + "tokens_seen": 533793792 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042335005015045135, + "loss": 3.1358, + "theoretical_loss": 3.8874796460635066, + "tokens_seen": 533859328 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004233400200601806, + "loss": 3.1536, + "theoretical_loss": 3.887428827522001, + "tokens_seen": 533924864 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004233299899699097, + "loss": 3.0268, + "theoretical_loss": 3.8873780169640773, + "tokens_seen": 533990400 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042331995987963895, + "loss": 3.1806, + "theoretical_loss": 3.8873272143875033, + "tokens_seen": 534055936 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 874112, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.037367820739746, + "objective/train/theoretical_loss": 3.8872891176914894, + "objective/train/tokens_used": 554565088, + "theoretical_loss": 3.8872891176914894, + "tokens_seen": 534105088 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042330992978936813, + "loss": 3.1187, + "theoretical_loss": 3.8872764197900453, + "tokens_seen": 534121472 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004232998996990973, + "loss": 2.9596, + "theoretical_loss": 3.8872256331694715, + "tokens_seen": 534187008 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004232898696088265, + "loss": 3.0873, + "theoretical_loss": 3.8871748545235505, + "tokens_seen": 534252544 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004232798395185557, + "loss": 3.1627, + "theoretical_loss": 3.8871240838500523, + "tokens_seen": 534318080 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042326980942828486, + "loss": 3.0254, + "theoretical_loss": 3.8870733211467483, + "tokens_seen": 534383616 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004232597793380141, + "loss": 2.9835, + "theoretical_loss": 3.8870225664114084, + "tokens_seen": 534449152 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004232497492477432, + "loss": 3.215, + "theoretical_loss": 3.886971819641807, + "tokens_seen": 534514688 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042323971915747246, + "loss": 3.1164, + "theoretical_loss": 3.886921080835716, + "tokens_seen": 534580224 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004232296890672016, + "loss": 3.0743, + "theoretical_loss": 3.88687034999091, + "tokens_seen": 534645760 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004232196589769308, + "loss": 2.9643, + "theoretical_loss": 3.8868196271051643, + "tokens_seen": 534711296 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042320962888666, + "loss": 3.203, + "theoretical_loss": 3.8867689121762554, + "tokens_seen": 534776832 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004231995987963892, + "loss": 3.0115, + "theoretical_loss": 3.88671820520196, + "tokens_seen": 534842368 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042318956870611836, + "loss": 3.0316, + "theoretical_loss": 3.886667506180056, + "tokens_seen": 534907904 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004231795386158476, + "loss": 3.045, + "theoretical_loss": 3.8866168151083214, + "tokens_seen": 534973440 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004231695085255767, + "loss": 2.879, + "theoretical_loss": 3.886566131984537, + "tokens_seen": 535038976 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042315947843530596, + "loss": 2.9962, + "theoretical_loss": 3.8865154568064826, + "tokens_seen": 535104512 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004231494483450351, + "loss": 2.9844, + "theoretical_loss": 3.8864647895719395, + "tokens_seen": 535170048 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004231394182547643, + "loss": 3.0645, + "theoretical_loss": 3.8864141302786903, + "tokens_seen": 535235584 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004231293881644935, + "loss": 3.2348, + "theoretical_loss": 3.886363478924518, + "tokens_seen": 535301120 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004231193580742227, + "loss": 3.1084, + "theoretical_loss": 3.8863128355072076, + "tokens_seen": 535366656 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042310932798395186, + "loss": 3.0648, + "theoretical_loss": 3.886262200024543, + "tokens_seen": 535432192 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042309929789368105, + "loss": 3.0643, + "theoretical_loss": 3.88621157247431, + "tokens_seen": 535497728 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004230892678034102, + "loss": 3.1472, + "theoretical_loss": 3.8861609528542957, + "tokens_seen": 535563264 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042307923771313946, + "loss": 2.9721, + "theoretical_loss": 3.886110341162288, + "tokens_seen": 535628800 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004230692076228686, + "loss": 3.1948, + "theoretical_loss": 3.8860597373960752, + "tokens_seen": 535694336 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 877004, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7617154121398926, + "objective/train/theoretical_loss": 3.886021789771389, + "objective/train/tokens_used": 556203488, + "theoretical_loss": 3.886021789771389, + "tokens_seen": 535743488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004230591775325978, + "loss": 2.9764, + "theoretical_loss": 3.886009141553447, + "tokens_seen": 535759872 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042304914744232695, + "loss": 3.0939, + "theoretical_loss": 3.885958553632193, + "tokens_seen": 535825408 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004230391173520562, + "loss": 3.0027, + "theoretical_loss": 3.885907973630105, + "tokens_seen": 535890944 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042302908726178537, + "loss": 2.9643, + "theoretical_loss": 3.885857401544975, + "tokens_seen": 535956480 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042301905717151455, + "loss": 3.1419, + "theoretical_loss": 3.885806837374596, + "tokens_seen": 536022016 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042300902708124373, + "loss": 3.0092, + "theoretical_loss": 3.8857562811167616, + "tokens_seen": 536087552 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042299899699097297, + "loss": 3.06, + "theoretical_loss": 3.8857057327692672, + "tokens_seen": 536153088 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004229889669007021, + "loss": 2.9589, + "theoretical_loss": 3.8856551923299074, + "tokens_seen": 536218624 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042297893681043133, + "loss": 2.9644, + "theoretical_loss": 3.8856046597964786, + "tokens_seen": 536284160 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042296890672016045, + "loss": 3.2111, + "theoretical_loss": 3.885554135166779, + "tokens_seen": 536349696 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004229588766298897, + "loss": 3.1416, + "theoretical_loss": 3.885503618438607, + "tokens_seen": 536415232 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042294884653961887, + "loss": 2.9617, + "theoretical_loss": 3.8854531096097613, + "tokens_seen": 536480768 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042293881644934805, + "loss": 2.9247, + "theoretical_loss": 3.885402608678042, + "tokens_seen": 536546304 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042292878635907723, + "loss": 3.071, + "theoretical_loss": 3.88535211564125, + "tokens_seen": 536611840 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004229187562688064, + "loss": 3.0962, + "theoretical_loss": 3.885301630497187, + "tokens_seen": 536677376 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004229087261785356, + "loss": 3.0443, + "theoretical_loss": 3.885251153243656, + "tokens_seen": 536742912 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042289869608826483, + "loss": 2.9007, + "theoretical_loss": 3.88520068387846, + "tokens_seen": 536808448 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042288866599799396, + "loss": 3.1494, + "theoretical_loss": 3.885150222399404, + "tokens_seen": 536873984 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004228786359077232, + "loss": 2.9524, + "theoretical_loss": 3.885099768804293, + "tokens_seen": 536939520 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004228686058174523, + "loss": 2.8923, + "theoretical_loss": 3.885049323090933, + "tokens_seen": 537005056 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042285857572718155, + "loss": 3.2114, + "theoretical_loss": 3.884998885257132, + "tokens_seen": 537070592 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042284854563691074, + "loss": 3.1636, + "theoretical_loss": 3.8849484553006968, + "tokens_seen": 537136128 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004228385155466399, + "loss": 2.9027, + "theoretical_loss": 3.8848980332194367, + "tokens_seen": 537201664 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004228284854563691, + "loss": 3.1972, + "theoretical_loss": 3.884847619011161, + "tokens_seen": 537267200 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042281845536609833, + "loss": 3.0832, + "theoretical_loss": 3.8847972126736816, + "tokens_seen": 537332736 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 879748, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9367454051971436, + "objective/train/theoretical_loss": 3.8847594130844643, + "objective/train/tokens_used": 557841888, + "theoretical_loss": 3.8847594130844643, + "tokens_seen": 537381888 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042280842527582746, + "loss": 3.154, + "theoretical_loss": 3.8847468142048083, + "tokens_seen": 537398272 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004227983951855567, + "loss": 3.079, + "theoretical_loss": 3.884696423602355, + "tokens_seen": 537463808 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004227883650952858, + "loss": 2.9072, + "theoretical_loss": 3.8846460408641335, + "tokens_seen": 537529344 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042277833500501506, + "loss": 3.2667, + "theoretical_loss": 3.884595665987958, + "tokens_seen": 537594880 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042276830491474424, + "loss": 3.0271, + "theoretical_loss": 3.884545298971644, + "tokens_seen": 537660416 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004227582748244734, + "loss": 2.8902, + "theoretical_loss": 3.884494939813008, + "tokens_seen": 537725952 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004227482447342026, + "loss": 3.2346, + "theoretical_loss": 3.884444588509865, + "tokens_seen": 537791488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004227382146439318, + "loss": 3.1566, + "theoretical_loss": 3.884394245060034, + "tokens_seen": 537857024 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042272818455366096, + "loss": 3.1053, + "theoretical_loss": 3.8843439094613323, + "tokens_seen": 537922560 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004227181544633902, + "loss": 3.028, + "theoretical_loss": 3.8842935817115807, + "tokens_seen": 537988096 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004227081243731193, + "loss": 3.0111, + "theoretical_loss": 3.884243261808598, + "tokens_seen": 538053632 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042269809428284856, + "loss": 3.1113, + "theoretical_loss": 3.884192949750206, + "tokens_seen": 538119168 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004226880641925777, + "loss": 2.99, + "theoretical_loss": 3.8841426455342263, + "tokens_seen": 538184704 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004226780341023069, + "loss": 3.1692, + "theoretical_loss": 3.8840923491584816, + "tokens_seen": 538250240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042266800401203616, + "loss": 3.1093, + "theoretical_loss": 3.8840420606207955, + "tokens_seen": 538315776 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004226579739217653, + "loss": 3.1406, + "theoretical_loss": 3.8839917799189934, + "tokens_seen": 538381312 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004226479438314945, + "loss": 3.051, + "theoretical_loss": 3.8839415070509, + "tokens_seen": 538446848 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004226379137412237, + "loss": 3.0939, + "theoretical_loss": 3.883891242014341, + "tokens_seen": 538512384 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004226278836509529, + "loss": 2.8974, + "theoretical_loss": 3.8838409848071453, + "tokens_seen": 538577920 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042261785356068206, + "loss": 3.1371, + "theoretical_loss": 3.883790735427139, + "tokens_seen": 538643456 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042260782347041125, + "loss": 3.1025, + "theoretical_loss": 3.8837404938721516, + "tokens_seen": 538708992 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004225977933801404, + "loss": 3.1657, + "theoretical_loss": 3.8836902601400136, + "tokens_seen": 538774528 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042258776328986966, + "loss": 3.0804, + "theoretical_loss": 3.883640034228555, + "tokens_seen": 538840064 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004225777331995988, + "loss": 3.1426, + "theoretical_loss": 3.883589816135607, + "tokens_seen": 538905600 + }, + { + "epoch": 1.06, + "learning_rate": 0.000422567703109328, + "loss": 2.9977, + "theoretical_loss": 3.883539605859002, + "tokens_seen": 538971136 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 882573, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.253732681274414, + "objective/train/theoretical_loss": 3.88350195327972, + "objective/train/tokens_used": 559480288, + "theoretical_loss": 3.88350195327972, + "tokens_seen": 539020288 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042255767301905715, + "loss": 2.931, + "theoretical_loss": 3.883489403396574, + "tokens_seen": 539036672 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004225476429287864, + "loss": 3.023, + "theoretical_loss": 3.8834392087461556, + "tokens_seen": 539102208 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042253761283851557, + "loss": 3.0665, + "theoretical_loss": 3.8833890219055833, + "tokens_seen": 539167744 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042252758274824475, + "loss": 3.1727, + "theoretical_loss": 3.8833388428726923, + "tokens_seen": 539233280 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042251755265797393, + "loss": 3.0099, + "theoretical_loss": 3.8832886716453188, + "tokens_seen": 539298816 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042250752256770317, + "loss": 2.8854, + "theoretical_loss": 3.883238508221301, + "tokens_seen": 539364352 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004224974924774323, + "loss": 2.861, + "theoretical_loss": 3.8831883525984763, + "tokens_seen": 539429888 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042248746238716153, + "loss": 3.102, + "theoretical_loss": 3.883138204774685, + "tokens_seen": 539495424 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042247743229689065, + "loss": 2.8898, + "theoretical_loss": 3.8830880647477666, + "tokens_seen": 539560960 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004224674022066199, + "loss": 2.8693, + "theoretical_loss": 3.8830379325155624, + "tokens_seen": 539626496 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042245737211634907, + "loss": 3.2424, + "theoretical_loss": 3.8829878080759137, + "tokens_seen": 539692032 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042244734202607825, + "loss": 3.0439, + "theoretical_loss": 3.882937691426664, + "tokens_seen": 539757568 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042243731193580743, + "loss": 3.169, + "theoretical_loss": 3.8828875825656564, + "tokens_seen": 539823104 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004224272818455366, + "loss": 2.9302, + "theoretical_loss": 3.8828374814907347, + "tokens_seen": 539888640 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004224172517552658, + "loss": 3.0732, + "theoretical_loss": 3.8827873881997457, + "tokens_seen": 539954176 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042240722166499503, + "loss": 3.2992, + "theoretical_loss": 3.8827373026905336, + "tokens_seen": 540019712 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042239719157472416, + "loss": 3.0527, + "theoretical_loss": 3.8826872249609465, + "tokens_seen": 540085248 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004223871614844534, + "loss": 3.0575, + "theoretical_loss": 3.882637155008832, + "tokens_seen": 540150784 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004223771313941825, + "loss": 3.0293, + "theoretical_loss": 3.882587092832039, + "tokens_seen": 540216320 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042236710130391176, + "loss": 3.0086, + "theoretical_loss": 3.8825370384284166, + "tokens_seen": 540281856 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042235707121364094, + "loss": 3.0995, + "theoretical_loss": 3.8824869917958154, + "tokens_seen": 540347392 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004223470411233701, + "loss": 3.0127, + "theoretical_loss": 3.8824369529320872, + "tokens_seen": 540412928 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004223370110330993, + "loss": 3.1761, + "theoretical_loss": 3.882386921835083, + "tokens_seen": 540478464 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042232698094282853, + "loss": 2.9028, + "theoretical_loss": 3.8823368985026567, + "tokens_seen": 540544000 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042231695085255766, + "loss": 3.0906, + "theoretical_loss": 3.8822868829326618, + "tokens_seen": 540609536 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 885364, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0970704555511475, + "objective/train/theoretical_loss": 3.8822493763479704, + "objective/train/tokens_used": 561118688, + "theoretical_loss": 3.8822493763479704, + "tokens_seen": 540658688 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004223069207622869, + "loss": 3.1621, + "theoretical_loss": 3.8822368751229526, + "tokens_seen": 540675072 + }, + { + "epoch": 1.06, + "learning_rate": 0.000422296890672016, + "loss": 2.8989, + "theoretical_loss": 3.882186875071385, + "tokens_seen": 540740608 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042228686058174526, + "loss": 3.0994, + "theoretical_loss": 3.882136882775815, + "tokens_seen": 540806144 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042227683049147444, + "loss": 2.9901, + "theoretical_loss": 3.8820868982341006, + "tokens_seen": 540871680 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004222668004012036, + "loss": 3.1014, + "theoretical_loss": 3.8820369214440995, + "tokens_seen": 540937216 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004222567703109328, + "loss": 3.1244, + "theoretical_loss": 3.88198695240367, + "tokens_seen": 541002752 + }, + { + "epoch": 1.06, + "learning_rate": 0.000422246740220662, + "loss": 3.2448, + "theoretical_loss": 3.881936991110673, + "tokens_seen": 541068288 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042223671013039116, + "loss": 2.9837, + "theoretical_loss": 3.881887037562968, + "tokens_seen": 541133824 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004222266800401204, + "loss": 3.0445, + "theoretical_loss": 3.8818370917584177, + "tokens_seen": 541199360 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004222166499498495, + "loss": 3.2427, + "theoretical_loss": 3.881787153694883, + "tokens_seen": 541264896 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042220661985957876, + "loss": 2.8575, + "theoretical_loss": 3.8817372233702283, + "tokens_seen": 541330432 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004221965897693079, + "loss": 3.1588, + "theoretical_loss": 3.8816873007823167, + "tokens_seen": 541395968 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004221865596790371, + "loss": 3.1097, + "theoretical_loss": 3.881637385929014, + "tokens_seen": 541461504 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004221765295887663, + "loss": 2.9693, + "theoretical_loss": 3.881587478808185, + "tokens_seen": 541527040 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004221664994984955, + "loss": 3.0403, + "theoretical_loss": 3.8815375794176967, + "tokens_seen": 541592576 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042215646940822467, + "loss": 2.9872, + "theoretical_loss": 3.881487687755417, + "tokens_seen": 541658112 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004221464393179539, + "loss": 3.0558, + "theoretical_loss": 3.881437803819213, + "tokens_seen": 541723648 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042213640922768303, + "loss": 3.01, + "theoretical_loss": 3.881387927606955, + "tokens_seen": 541789184 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042212637913741226, + "loss": 3.1126, + "theoretical_loss": 3.881338059116512, + "tokens_seen": 541854720 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004221163490471414, + "loss": 3.1592, + "theoretical_loss": 3.8812881983457554, + "tokens_seen": 541920256 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004221063189568706, + "loss": 2.8882, + "theoretical_loss": 3.8812383452925565, + "tokens_seen": 541985792 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004220962888665998, + "loss": 3.1343, + "theoretical_loss": 3.8811884999547877, + "tokens_seen": 542051328 + }, + { + "epoch": 1.06, + "learning_rate": 0.000422086258776329, + "loss": 3.1753, + "theoretical_loss": 3.881138662330323, + "tokens_seen": 542116864 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042207622868605817, + "loss": 3.0445, + "theoretical_loss": 3.881088832417036, + "tokens_seen": 542182400 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042206619859578735, + "loss": 2.9565, + "theoretical_loss": 3.8810390102128016, + "tokens_seen": 542247936 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 886740, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.920203447341919, + "objective/train/theoretical_loss": 3.8810016486174135, + "objective/train/tokens_used": 562757088, + "theoretical_loss": 3.8810016486174135, + "tokens_seen": 542297088 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042205616850551653, + "loss": 3.0331, + "theoretical_loss": 3.880989195715496, + "tokens_seen": 542313472 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042204613841524577, + "loss": 3.0207, + "theoretical_loss": 3.8809393889229957, + "tokens_seen": 542379008 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004220361083249749, + "loss": 2.9043, + "theoretical_loss": 3.880889589833178, + "tokens_seen": 542444544 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042202607823470413, + "loss": 2.8862, + "theoretical_loss": 3.8808397984439225, + "tokens_seen": 542510080 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004220160481444333, + "loss": 3.1715, + "theoretical_loss": 3.8807900147531065, + "tokens_seen": 542575616 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004220060180541625, + "loss": 3.1682, + "theoretical_loss": 3.8807402387586114, + "tokens_seen": 542641152 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004219959879638917, + "loss": 2.9896, + "theoretical_loss": 3.880690470458318, + "tokens_seen": 542706688 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042198595787362085, + "loss": 3.2723, + "theoretical_loss": 3.8806407098501072, + "tokens_seen": 542772224 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042197592778335004, + "loss": 3.1645, + "theoretical_loss": 3.8805909569318633, + "tokens_seen": 542837760 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042196589769307927, + "loss": 3.0914, + "theoretical_loss": 3.880541211701468, + "tokens_seen": 542903296 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004219558676028084, + "loss": 3.1241, + "theoretical_loss": 3.880491474156806, + "tokens_seen": 542968832 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042194583751253763, + "loss": 3.2102, + "theoretical_loss": 3.8804417442957626, + "tokens_seen": 543034368 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042193580742226676, + "loss": 3.08, + "theoretical_loss": 3.8803920221162236, + "tokens_seen": 543099904 + }, + { + "epoch": 1.06, + "learning_rate": 0.000421925777331996, + "loss": 3.1196, + "theoretical_loss": 3.880342307616076, + "tokens_seen": 543165440 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042191574724172523, + "loss": 3.2464, + "theoretical_loss": 3.880292600793207, + "tokens_seen": 543230976 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042190571715145436, + "loss": 3.016, + "theoretical_loss": 3.880242901645506, + "tokens_seen": 543296512 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004218956870611836, + "loss": 2.9737, + "theoretical_loss": 3.8801932101708605, + "tokens_seen": 543362048 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004218856569709127, + "loss": 3.2015, + "theoretical_loss": 3.8801435263671626, + "tokens_seen": 543427584 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042187562688064196, + "loss": 3.0433, + "theoretical_loss": 3.8800938502323015, + "tokens_seen": 543493120 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042186559679037114, + "loss": 2.9853, + "theoretical_loss": 3.88004418176417, + "tokens_seen": 543558656 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004218555667001003, + "loss": 3.0819, + "theoretical_loss": 3.8799945209606603, + "tokens_seen": 543624192 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004218455366098295, + "loss": 3.0336, + "theoretical_loss": 3.879944867819667, + "tokens_seen": 543689728 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042183550651955873, + "loss": 3.0895, + "theoretical_loss": 3.8798952223390826, + "tokens_seen": 543755264 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042182547642928786, + "loss": 3.0031, + "theoretical_loss": 3.879845584516803, + "tokens_seen": 543820800 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004218154463390171, + "loss": 3.0446, + "theoretical_loss": 3.8797959543507243, + "tokens_seen": 543886336 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 889671, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2119944095611572, + "objective/train/theoretical_loss": 3.879758736749282, + "objective/train/tokens_used": 564395488, + "theoretical_loss": 3.879758736749282, + "tokens_seen": 543935488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004218054162487462, + "loss": 3.194, + "theoretical_loss": 3.879746331838743, + "tokens_seen": 543951872 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042179538615847546, + "loss": 3.085, + "theoretical_loss": 3.879696716978757, + "tokens_seen": 544017408 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042178535606820464, + "loss": 3.0669, + "theoretical_loss": 3.8796471097686642, + "tokens_seen": 544082944 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004217753259779338, + "loss": 2.8497, + "theoretical_loss": 3.8795975102063647, + "tokens_seen": 544148480 + }, + { + "epoch": 1.06, + "learning_rate": 0.000421765295887663, + "loss": 2.9425, + "theoretical_loss": 3.8795479182897576, + "tokens_seen": 544214016 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004217552657973922, + "loss": 2.9348, + "theoretical_loss": 3.879498334016745, + "tokens_seen": 544279552 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042174523570712136, + "loss": 3.0157, + "theoretical_loss": 3.8794487573852274, + "tokens_seen": 544345088 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004217352056168506, + "loss": 3.0763, + "theoretical_loss": 3.8793991883931085, + "tokens_seen": 544410624 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004217251755265797, + "loss": 3.1068, + "theoretical_loss": 3.8793496270382906, + "tokens_seen": 544476160 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042171514543630896, + "loss": 3.1988, + "theoretical_loss": 3.879300073318679, + "tokens_seen": 544541696 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004217051153460381, + "loss": 3.0016, + "theoretical_loss": 3.879250527232178, + "tokens_seen": 544607232 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004216950852557673, + "loss": 2.9387, + "theoretical_loss": 3.8792009887766934, + "tokens_seen": 544672768 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004216850551654965, + "loss": 3.0541, + "theoretical_loss": 3.8791514579501323, + "tokens_seen": 544738304 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004216750250752257, + "loss": 2.831, + "theoretical_loss": 3.8791019347504028, + "tokens_seen": 544803840 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042166499498495487, + "loss": 3.0506, + "theoretical_loss": 3.8790524191754123, + "tokens_seen": 544869376 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004216549648946841, + "loss": 3.0333, + "theoretical_loss": 3.879002911223071, + "tokens_seen": 544934912 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042164493480441323, + "loss": 2.7341, + "theoretical_loss": 3.8789534108912873, + "tokens_seen": 545000448 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042163490471414246, + "loss": 2.9682, + "theoretical_loss": 3.878903918177974, + "tokens_seen": 545065984 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004216248746238716, + "loss": 3.0932, + "theoretical_loss": 3.8788544330810417, + "tokens_seen": 545131520 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004216148445336008, + "loss": 3.116, + "theoretical_loss": 3.8788049555984023, + "tokens_seen": 545197056 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042160481444333, + "loss": 3.0179, + "theoretical_loss": 3.8787554857279702, + "tokens_seen": 545262592 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004215947843530592, + "loss": 3.0275, + "theoretical_loss": 3.878706023467659, + "tokens_seen": 545328128 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042158475426278837, + "loss": 2.9807, + "theoretical_loss": 3.8786565688153845, + "tokens_seen": 545393664 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042157472417251755, + "loss": 3.2152, + "theoretical_loss": 3.8786071217690608, + "tokens_seen": 545459200 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042156469408224673, + "loss": 3.0834, + "theoretical_loss": 3.8785576823266057, + "tokens_seen": 545524736 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 892417, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.12251615524292, + "objective/train/theoretical_loss": 3.8785206077335506, + "objective/train/tokens_used": 566033888, + "theoretical_loss": 3.8785206077335506, + "tokens_seen": 545573888 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042155466399197597, + "loss": 2.989, + "theoretical_loss": 3.878508250485937, + "tokens_seen": 545590272 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004215446339017051, + "loss": 3.1183, + "theoretical_loss": 3.878458826244972, + "tokens_seen": 545655808 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042153460381143433, + "loss": 3.0006, + "theoretical_loss": 3.87840940960163, + "tokens_seen": 545721344 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004215245737211635, + "loss": 3.048, + "theoretical_loss": 3.8783600005538306, + "tokens_seen": 545786880 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004215145436308927, + "loss": 3.0854, + "theoretical_loss": 3.8783105990994957, + "tokens_seen": 545852416 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004215045135406219, + "loss": 3.1335, + "theoretical_loss": 3.8782612052365457, + "tokens_seen": 545917952 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042149448345035105, + "loss": 3.1244, + "theoretical_loss": 3.878211818962903, + "tokens_seen": 545983488 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042148445336008024, + "loss": 3.134, + "theoretical_loss": 3.878162440276491, + "tokens_seen": 546049024 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042147442326980947, + "loss": 3.0755, + "theoretical_loss": 3.878113069175234, + "tokens_seen": 546114560 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004214643931795386, + "loss": 2.9923, + "theoretical_loss": 3.8780637056570564, + "tokens_seen": 546180096 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042145436308926783, + "loss": 3.0844, + "theoretical_loss": 3.8780143497198836, + "tokens_seen": 546245632 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042144433299899696, + "loss": 2.9078, + "theoretical_loss": 3.8779650013616425, + "tokens_seen": 546311168 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004214343029087262, + "loss": 2.9578, + "theoretical_loss": 3.8779156605802596, + "tokens_seen": 546376704 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004214242728184554, + "loss": 3.0297, + "theoretical_loss": 3.877866327373664, + "tokens_seen": 546442240 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042141424272818456, + "loss": 3.0001, + "theoretical_loss": 3.877817001739784, + "tokens_seen": 546507776 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042140421263791374, + "loss": 3.0658, + "theoretical_loss": 3.8777676836765496, + "tokens_seen": 546573312 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004213941825476429, + "loss": 2.9052, + "theoretical_loss": 3.8777183731818905, + "tokens_seen": 546638848 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004213841524573721, + "loss": 3.2106, + "theoretical_loss": 3.877669070253739, + "tokens_seen": 546704384 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042137412236710134, + "loss": 2.9036, + "theoretical_loss": 3.877619774890026, + "tokens_seen": 546769920 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042136409227683046, + "loss": 3.1402, + "theoretical_loss": 3.877570487088686, + "tokens_seen": 546835456 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004213540621865597, + "loss": 3.1, + "theoretical_loss": 3.8775212068476517, + "tokens_seen": 546900992 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004213440320962889, + "loss": 3.1815, + "theoretical_loss": 3.8774719341648574, + "tokens_seen": 546966528 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042133400200601806, + "loss": 3.0783, + "theoretical_loss": 3.87742266903824, + "tokens_seen": 547032064 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042132397191574724, + "loss": 3.0006, + "theoretical_loss": 3.877373411465734, + "tokens_seen": 547097600 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004213139418254764, + "loss": 3.1774, + "theoretical_loss": 3.8773241614452774, + "tokens_seen": 547163136 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 895323, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3490610122680664, + "objective/train/theoretical_loss": 3.877287228884726, + "objective/train/tokens_used": 567672288, + "theoretical_loss": 3.877287228884726, + "tokens_seen": 547212288 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004213039117352056, + "loss": 3.1464, + "theoretical_loss": 3.8772749189748072, + "tokens_seen": 547228672 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042129388164493484, + "loss": 2.766, + "theoretical_loss": 3.877225684052263, + "tokens_seen": 547294208 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042128385155466397, + "loss": 3.203, + "theoretical_loss": 3.877176456675583, + "tokens_seen": 547359744 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004212738214643932, + "loss": 3.1049, + "theoretical_loss": 3.8771272368427088, + "tokens_seen": 547425280 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042126379137412233, + "loss": 3.0727, + "theoretical_loss": 3.877078024551581, + "tokens_seen": 547490816 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042125376128385156, + "loss": 3.0513, + "theoretical_loss": 3.877028819800141, + "tokens_seen": 547556352 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042124373119358075, + "loss": 3.1606, + "theoretical_loss": 3.8769796225863313, + "tokens_seen": 547621888 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004212337011033099, + "loss": 3.1707, + "theoretical_loss": 3.876930432908096, + "tokens_seen": 547687424 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004212236710130391, + "loss": 2.9551, + "theoretical_loss": 3.876881250763379, + "tokens_seen": 547752960 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004212136409227683, + "loss": 3.0413, + "theoretical_loss": 3.876832076150126, + "tokens_seen": 547818496 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042120361083249747, + "loss": 2.961, + "theoretical_loss": 3.876782909066282, + "tokens_seen": 547884032 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004211935807422267, + "loss": 3.0082, + "theoretical_loss": 3.8767337495097944, + "tokens_seen": 547949568 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042118355065195583, + "loss": 2.9422, + "theoretical_loss": 3.87668459747861, + "tokens_seen": 548015104 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042117352056168507, + "loss": 2.8972, + "theoretical_loss": 3.876635452970678, + "tokens_seen": 548080640 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004211634904714143, + "loss": 3.1537, + "theoretical_loss": 3.876586315983947, + "tokens_seen": 548146176 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042115346038114343, + "loss": 3.0474, + "theoretical_loss": 3.8765371865163667, + "tokens_seen": 548211712 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042114343029087266, + "loss": 3.0601, + "theoretical_loss": 3.876488064565888, + "tokens_seen": 548277248 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004211334002006018, + "loss": 2.9627, + "theoretical_loss": 3.8764389501304626, + "tokens_seen": 548342784 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042112337011033103, + "loss": 3.0256, + "theoretical_loss": 3.8763898432080426, + "tokens_seen": 548408320 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004211133400200602, + "loss": 3.1087, + "theoretical_loss": 3.8763407437965816, + "tokens_seen": 548473856 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004211033099297894, + "loss": 3.109, + "theoretical_loss": 3.8762916518940327, + "tokens_seen": 548539392 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042109327983951857, + "loss": 2.8927, + "theoretical_loss": 3.876242567498351, + "tokens_seen": 548604928 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042108324974924775, + "loss": 3.0534, + "theoretical_loss": 3.876193490607492, + "tokens_seen": 548670464 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042107321965897693, + "loss": 3.1555, + "theoretical_loss": 3.876144421219413, + "tokens_seen": 548736000 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042106318956870617, + "loss": 3.1299, + "theoretical_loss": 3.8760953593320693, + "tokens_seen": 548801536 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 897788, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.212775945663452, + "objective/train/theoretical_loss": 3.876058567837692, + "objective/train/tokens_used": 569310688, + "theoretical_loss": 3.876058567837692, + "tokens_seen": 548850688 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004210531594784353, + "loss": 3.1615, + "theoretical_loss": 3.8760463049434204, + "tokens_seen": 548867072 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042104312938816453, + "loss": 2.9875, + "theoretical_loss": 3.8759972580514246, + "tokens_seen": 548932608 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004210330992978937, + "loss": 3.1278, + "theoretical_loss": 3.875948218654041, + "tokens_seen": 548998144 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004210230692076229, + "loss": 3.0488, + "theoretical_loss": 3.87589918674923, + "tokens_seen": 549063680 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004210130391173521, + "loss": 3.1055, + "theoretical_loss": 3.8758501623349533, + "tokens_seen": 549129216 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042100300902708125, + "loss": 2.943, + "theoretical_loss": 3.875801145409172, + "tokens_seen": 549194752 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042099297893681044, + "loss": 3.1174, + "theoretical_loss": 3.875752135969849, + "tokens_seen": 549260288 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042098294884653967, + "loss": 2.9175, + "theoretical_loss": 3.875703134014949, + "tokens_seen": 549325824 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004209729187562688, + "loss": 2.88, + "theoretical_loss": 3.875654139542435, + "tokens_seen": 549391360 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042096288866599803, + "loss": 3.0871, + "theoretical_loss": 3.875605152550272, + "tokens_seen": 549456896 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042095285857572716, + "loss": 3.0007, + "theoretical_loss": 3.8755561730364274, + "tokens_seen": 549522432 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004209428284854564, + "loss": 3.0197, + "theoretical_loss": 3.8755072009988663, + "tokens_seen": 549587968 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004209327983951856, + "loss": 3.0147, + "theoretical_loss": 3.875458236435557, + "tokens_seen": 549653504 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042092276830491476, + "loss": 3.1531, + "theoretical_loss": 3.875409279344468, + "tokens_seen": 549719040 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042091273821464394, + "loss": 3.0841, + "theoretical_loss": 3.8753603297235673, + "tokens_seen": 549784576 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004209027081243731, + "loss": 3.0927, + "theoretical_loss": 3.875311387570826, + "tokens_seen": 549850112 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004208926780341023, + "loss": 2.9554, + "theoretical_loss": 3.875262452884215, + "tokens_seen": 549915648 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042088264794383154, + "loss": 3.1243, + "theoretical_loss": 3.875213525661704, + "tokens_seen": 549981184 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042087261785356066, + "loss": 3.0046, + "theoretical_loss": 3.8751646059012663, + "tokens_seen": 550046720 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004208625877632899, + "loss": 3.079, + "theoretical_loss": 3.8751156936008755, + "tokens_seen": 550112256 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004208525576730191, + "loss": 2.961, + "theoretical_loss": 3.875066788758505, + "tokens_seen": 550177792 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042084252758274826, + "loss": 2.8198, + "theoretical_loss": 3.8750178913721287, + "tokens_seen": 550243328 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042083249749247744, + "loss": 3.0803, + "theoretical_loss": 3.874969001439723, + "tokens_seen": 550308864 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004208224674022066, + "loss": 3.0645, + "theoretical_loss": 3.8749201189592646, + "tokens_seen": 550374400 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004208124373119358, + "loss": 3.1021, + "theoretical_loss": 3.8748712439287294, + "tokens_seen": 550439936 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 900596, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3852930068969727, + "objective/train/theoretical_loss": 3.8748345925436234, + "objective/train/tokens_used": 570949088, + "theoretical_loss": 3.8748345925436234, + "tokens_seen": 550489088 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042080240722166504, + "loss": 3.1029, + "theoretical_loss": 3.8748223763460947, + "tokens_seen": 550505472 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042079237713139417, + "loss": 2.9437, + "theoretical_loss": 3.8747735162093404, + "tokens_seen": 550571008 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004207823470411234, + "loss": 3.1637, + "theoretical_loss": 3.874724663516446, + "tokens_seen": 550636544 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042077231695085253, + "loss": 3.0301, + "theoretical_loss": 3.8746758182653904, + "tokens_seen": 550702080 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042076228686058176, + "loss": 3.0286, + "theoretical_loss": 3.8746269804541558, + "tokens_seen": 550767616 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042075225677031095, + "loss": 2.9413, + "theoretical_loss": 3.8745781500807235, + "tokens_seen": 550833152 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004207422266800401, + "loss": 3.0915, + "theoretical_loss": 3.874529327143075, + "tokens_seen": 550898688 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004207321965897693, + "loss": 3.098, + "theoretical_loss": 3.874480511639195, + "tokens_seen": 550964224 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004207221664994985, + "loss": 3.0804, + "theoretical_loss": 3.8744317035670672, + "tokens_seen": 551029760 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042071213640922767, + "loss": 2.916, + "theoretical_loss": 3.874382902924677, + "tokens_seen": 551095296 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004207021063189569, + "loss": 3.1728, + "theoretical_loss": 3.874334109710009, + "tokens_seen": 551160832 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042069207622868603, + "loss": 3.1299, + "theoretical_loss": 3.87428532392105, + "tokens_seen": 551226368 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042068204613841527, + "loss": 3.0909, + "theoretical_loss": 3.874236545555788, + "tokens_seen": 551291904 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042067201604814445, + "loss": 3.0511, + "theoretical_loss": 3.87418777461221, + "tokens_seen": 551357440 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042066198595787363, + "loss": 2.9813, + "theoretical_loss": 3.8741390110883054, + "tokens_seen": 551422976 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004206519558676028, + "loss": 3.3134, + "theoretical_loss": 3.8740902549820637, + "tokens_seen": 551488512 + }, + { + "epoch": 1.06, + "learning_rate": 0.000420641925777332, + "loss": 3.126, + "theoretical_loss": 3.8740415062914755, + "tokens_seen": 551554048 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004206318956870612, + "loss": 3.0145, + "theoretical_loss": 3.8739927650145316, + "tokens_seen": 551619584 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004206218655967904, + "loss": 3.0179, + "theoretical_loss": 3.8739440311492244, + "tokens_seen": 551685120 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042061183550651954, + "loss": 3.0774, + "theoretical_loss": 3.873895304693546, + "tokens_seen": 551750656 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042060180541624877, + "loss": 3.0938, + "theoretical_loss": 3.8738465856454907, + "tokens_seen": 551816192 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004205917753259779, + "loss": 2.9336, + "theoretical_loss": 3.873797874003052, + "tokens_seen": 551881728 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042058174523570713, + "loss": 2.9425, + "theoretical_loss": 3.873749169764225, + "tokens_seen": 551947264 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004205717151454363, + "loss": 2.845, + "theoretical_loss": 3.8737004729270064, + "tokens_seen": 552012800 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004205616850551655, + "loss": 2.9487, + "theoretical_loss": 3.873651783489392, + "tokens_seen": 552078336 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 903502, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9161651134490967, + "objective/train/theoretical_loss": 3.8736152712659675, + "objective/train/tokens_used": 572587488, + "theoretical_loss": 3.8736152712659675, + "tokens_seen": 552127488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004205516549648947, + "loss": 3.1243, + "theoretical_loss": 3.87360310144938, + "tokens_seen": 552143872 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004205416248746239, + "loss": 2.9808, + "theoretical_loss": 3.873554426804968, + "tokens_seen": 552209408 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042053159478435304, + "loss": 2.9605, + "theoretical_loss": 3.8735057595541553, + "tokens_seen": 552274944 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004205215646940823, + "loss": 3.2057, + "theoretical_loss": 3.8734570996949413, + "tokens_seen": 552340480 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004205115346038114, + "loss": 3.1399, + "theoretical_loss": 3.8734084472253274, + "tokens_seen": 552406016 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042050150451354064, + "loss": 3.159, + "theoretical_loss": 3.8733598021433133, + "tokens_seen": 552471552 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004204914744232698, + "loss": 3.0944, + "theoretical_loss": 3.8733111644469025, + "tokens_seen": 552537088 + }, + { + "epoch": 1.06, + "learning_rate": 0.000420481444332999, + "loss": 3.0675, + "theoretical_loss": 3.8732625341340974, + "tokens_seen": 552602624 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004204714142427282, + "loss": 3.1113, + "theoretical_loss": 3.8732139112029014, + "tokens_seen": 552668160 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042046138415245736, + "loss": 3.1286, + "theoretical_loss": 3.8731652956513196, + "tokens_seen": 552733696 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042045135406218654, + "loss": 3.1072, + "theoretical_loss": 3.873116687477357, + "tokens_seen": 552799232 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004204413239719158, + "loss": 3.2053, + "theoretical_loss": 3.873068086679019, + "tokens_seen": 552864768 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004204312938816449, + "loss": 3.1269, + "theoretical_loss": 3.8730194932543127, + "tokens_seen": 552930304 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042042126379137414, + "loss": 3.0963, + "theoretical_loss": 3.8729709072012457, + "tokens_seen": 552995840 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004204112337011033, + "loss": 2.9978, + "theoretical_loss": 3.8729223285178263, + "tokens_seen": 553061376 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004204012036108325, + "loss": 2.976, + "theoretical_loss": 3.872873757202064, + "tokens_seen": 553126912 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042039117352056174, + "loss": 3.1014, + "theoretical_loss": 3.872825193251967, + "tokens_seen": 553192448 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042038114343029086, + "loss": 3.056, + "theoretical_loss": 3.872776636665548, + "tokens_seen": 553257984 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004203711133400201, + "loss": 2.8981, + "theoretical_loss": 3.872728087440817, + "tokens_seen": 553323520 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004203610832497493, + "loss": 2.905, + "theoretical_loss": 3.8726795455757874, + "tokens_seen": 553389056 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042035105315947846, + "loss": 3.0733, + "theoretical_loss": 3.8726310110684707, + "tokens_seen": 553454592 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042034102306920764, + "loss": 2.9574, + "theoretical_loss": 3.8725824839168816, + "tokens_seen": 553520128 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004203309929789368, + "loss": 3.1558, + "theoretical_loss": 3.872533964119034, + "tokens_seen": 553585664 + }, + { + "epoch": 1.06, + "learning_rate": 0.000420320962888666, + "loss": 3.1782, + "theoretical_loss": 3.8724854516729437, + "tokens_seen": 553651200 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042031093279839524, + "loss": 3.1466, + "theoretical_loss": 3.8724369465766264, + "tokens_seen": 553716736 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 904895, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0776209831237793, + "objective/train/theoretical_loss": 3.872400572576484, + "objective/train/tokens_used": 574225888, + "theoretical_loss": 3.872400572576484, + "tokens_seen": 553765888 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042030090270812437, + "loss": 3.1169, + "theoretical_loss": 3.872388448828099, + "tokens_seen": 553782272 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004202908726178536, + "loss": 3.0838, + "theoretical_loss": 3.872339958425379, + "tokens_seen": 553847808 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042028084252758273, + "loss": 3.279, + "theoretical_loss": 3.8722914753664845, + "tokens_seen": 553913344 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042027081243731196, + "loss": 3.1668, + "theoretical_loss": 3.8722429996494347, + "tokens_seen": 553978880 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042026078234704115, + "loss": 2.8138, + "theoretical_loss": 3.87219453127225, + "tokens_seen": 554044416 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004202507522567703, + "loss": 3.0224, + "theoretical_loss": 3.8721460702329504, + "tokens_seen": 554109952 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004202407221664995, + "loss": 3.0173, + "theoretical_loss": 3.8720976165295578, + "tokens_seen": 554175488 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004202306920762287, + "loss": 3.0067, + "theoretical_loss": 3.872049170160094, + "tokens_seen": 554241024 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042022066198595787, + "loss": 3.1669, + "theoretical_loss": 3.872000731122582, + "tokens_seen": 554306560 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004202106318956871, + "loss": 3.0055, + "theoretical_loss": 3.871952299415046, + "tokens_seen": 554372096 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042020060180541623, + "loss": 3.2296, + "theoretical_loss": 3.8719038750355095, + "tokens_seen": 554437632 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042019057171514547, + "loss": 3.0499, + "theoretical_loss": 3.8718554579819986, + "tokens_seen": 554503168 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042018054162487465, + "loss": 3.0181, + "theoretical_loss": 3.8718070482525384, + "tokens_seen": 554568704 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042017051153460383, + "loss": 2.9593, + "theoretical_loss": 3.8717586458451567, + "tokens_seen": 554634240 + }, + { + "epoch": 1.06, + "learning_rate": 0.000420160481444333, + "loss": 3.0677, + "theoretical_loss": 3.8717102507578804, + "tokens_seen": 554699776 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004201504513540622, + "loss": 3.2146, + "theoretical_loss": 3.8716618629887387, + "tokens_seen": 554765312 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004201404212637914, + "loss": 2.9575, + "theoretical_loss": 3.8716134825357593, + "tokens_seen": 554830848 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004201303911735206, + "loss": 2.9932, + "theoretical_loss": 3.8715651093969727, + "tokens_seen": 554896384 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042012036108324974, + "loss": 3.0142, + "theoretical_loss": 3.8715167435704094, + "tokens_seen": 554961920 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042011033099297897, + "loss": 2.9104, + "theoretical_loss": 3.871468385054101, + "tokens_seen": 555027456 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004201003009027081, + "loss": 3.2188, + "theoretical_loss": 3.8714200338460794, + "tokens_seen": 555092992 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042009027081243733, + "loss": 3.1484, + "theoretical_loss": 3.8713716899443775, + "tokens_seen": 555158528 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004200802407221665, + "loss": 3.1043, + "theoretical_loss": 3.8713233533470293, + "tokens_seen": 555224064 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004200702106318957, + "loss": 3.1147, + "theoretical_loss": 3.871275024052068, + "tokens_seen": 555289600 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004200601805416249, + "loss": 2.9665, + "theoretical_loss": 3.87122670205753, + "tokens_seen": 555355136 + }, + { + "epoch": 1.06, + "objective/train/docs_used": 907748, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.337294340133667, + "objective/train/theoretical_loss": 3.8711904653513476, + "objective/train/tokens_used": 575864288, + "theoretical_loss": 3.8711904653513476, + "tokens_seen": 555404288 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004200501504513541, + "loss": 3.2653, + "theoretical_loss": 3.871178387361451, + "tokens_seen": 555420672 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042004012036108324, + "loss": 3.1466, + "theoretical_loss": 3.8711300799618673, + "tokens_seen": 555486208 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004200300902708125, + "loss": 2.9578, + "theoretical_loss": 3.8710817798568167, + "tokens_seen": 555551744 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004200200601805416, + "loss": 3.1799, + "theoretical_loss": 3.871033487044337, + "tokens_seen": 555617280 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042001003009027084, + "loss": 3.1151, + "theoretical_loss": 3.8709852015224673, + "tokens_seen": 555682816 + }, + { + "epoch": 1.06, + "learning_rate": 0.00042, + "loss": 3.032, + "theoretical_loss": 3.8709369232892477, + "tokens_seen": 555748352 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004199899699097292, + "loss": 3.0877, + "theoretical_loss": 3.8708886523427184, + "tokens_seen": 555813888 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004199799398194584, + "loss": 3.1005, + "theoretical_loss": 3.8708403886809206, + "tokens_seen": 555879424 + }, + { + "epoch": 1.06, + "learning_rate": 0.00041996990972918756, + "loss": 3.1034, + "theoretical_loss": 3.870792132301896, + "tokens_seen": 555944960 + }, + { + "epoch": 1.06, + "learning_rate": 0.00041995987963891674, + "loss": 3.0111, + "theoretical_loss": 3.870743883203688, + "tokens_seen": 556010496 + }, + { + "epoch": 1.06, + "learning_rate": 0.000419949849548646, + "loss": 3.1428, + "theoretical_loss": 3.87069564138434, + "tokens_seen": 556076032 + }, + { + "epoch": 1.06, + "learning_rate": 0.0004199398194583751, + "loss": 3.1532, + "theoretical_loss": 3.8706474068418952, + "tokens_seen": 556141568 + }, + { + "epoch": 1.06, + "learning_rate": 0.00041992978936810434, + "loss": 3.2356, + "theoretical_loss": 3.8705991795743997, + "tokens_seen": 556207104 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041991975927783347, + "loss": 3.0407, + "theoretical_loss": 3.8705509595798993, + "tokens_seen": 556272640 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004199097291875627, + "loss": 3.0829, + "theoretical_loss": 3.87050274685644, + "tokens_seen": 556338176 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004198996990972919, + "loss": 3.0706, + "theoretical_loss": 3.8704545414020695, + "tokens_seen": 556403712 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041988966900702106, + "loss": 2.933, + "theoretical_loss": 3.8704063432148357, + "tokens_seen": 556469248 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041987963891675025, + "loss": 3.0422, + "theoretical_loss": 3.870358152292787, + "tokens_seen": 556534784 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004198696088264795, + "loss": 3.0585, + "theoretical_loss": 3.8703099686339737, + "tokens_seen": 556600320 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004198595787362086, + "loss": 2.9948, + "theoretical_loss": 3.870261792236445, + "tokens_seen": 556665856 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041984954864593784, + "loss": 3.1083, + "theoretical_loss": 3.870213623098253, + "tokens_seen": 556731392 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041983951855566697, + "loss": 3.0579, + "theoretical_loss": 3.870165461217449, + "tokens_seen": 556796928 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004198294884653962, + "loss": 3.1067, + "theoretical_loss": 3.8701173065920864, + "tokens_seen": 556862464 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004198194583751254, + "loss": 3.0942, + "theoretical_loss": 3.8700691592202165, + "tokens_seen": 556928000 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041980942828485457, + "loss": 3.1622, + "theoretical_loss": 3.8700210190998954, + "tokens_seen": 556993536 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 910258, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.130556583404541, + "objective/train/theoretical_loss": 3.8699849187673134, + "objective/train/tokens_used": 577502688, + "theoretical_loss": 3.8699849187673134, + "tokens_seen": 557042688 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041979939819458375, + "loss": 3.1354, + "theoretical_loss": 3.8699728862291773, + "tokens_seen": 557059072 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041978936810431293, + "loss": 3.1401, + "theoretical_loss": 3.8699247606061173, + "tokens_seen": 557124608 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004197793380140421, + "loss": 3.0346, + "theoretical_loss": 3.869876642228772, + "tokens_seen": 557190144 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041976930792377135, + "loss": 3.0823, + "theoretical_loss": 3.869828531095199, + "tokens_seen": 557255680 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041975927783350047, + "loss": 3.033, + "theoretical_loss": 3.8697804272034553, + "tokens_seen": 557321216 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004197492477432297, + "loss": 3.0984, + "theoretical_loss": 3.8697323305515994, + "tokens_seen": 557386752 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041973921765295883, + "loss": 3.0218, + "theoretical_loss": 3.869684241137691, + "tokens_seen": 557452288 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041972918756268807, + "loss": 3.0976, + "theoretical_loss": 3.86963615895979, + "tokens_seen": 557517824 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041971915747241725, + "loss": 3.127, + "theoretical_loss": 3.8695880840159576, + "tokens_seen": 557583360 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041970912738214643, + "loss": 3.2234, + "theoretical_loss": 3.8695400163042546, + "tokens_seen": 557648896 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004196990972918756, + "loss": 3.078, + "theoretical_loss": 3.8694919558227445, + "tokens_seen": 557714432 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041968906720160485, + "loss": 2.9652, + "theoretical_loss": 3.869443902569489, + "tokens_seen": 557779968 + }, + { + "epoch": 1.07, + "learning_rate": 0.000419679037111334, + "loss": 3.2376, + "theoretical_loss": 3.8693958565425524, + "tokens_seen": 557845504 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004196690070210632, + "loss": 2.9924, + "theoretical_loss": 3.8693478177399996, + "tokens_seen": 557911040 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004196589769307924, + "loss": 3.141, + "theoretical_loss": 3.8692997861598952, + "tokens_seen": 557976576 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004196489468405216, + "loss": 3.0505, + "theoretical_loss": 3.8692517618003057, + "tokens_seen": 558042112 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004196389167502508, + "loss": 3.1419, + "theoretical_loss": 3.8692037446592984, + "tokens_seen": 558107648 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041962888665997994, + "loss": 3.2067, + "theoretical_loss": 3.8691557347349397, + "tokens_seen": 558173184 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041961885656970917, + "loss": 3.1966, + "theoretical_loss": 3.8691077320252982, + "tokens_seen": 558238720 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004196088264794383, + "loss": 3.0199, + "theoretical_loss": 3.8690597365284436, + "tokens_seen": 558304256 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041959879638916753, + "loss": 2.9831, + "theoretical_loss": 3.8690117482424444, + "tokens_seen": 558369792 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004195887662988967, + "loss": 3.1679, + "theoretical_loss": 3.868963767165372, + "tokens_seen": 558435328 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004195787362086259, + "loss": 3.0623, + "theoretical_loss": 3.868915793295298, + "tokens_seen": 558500864 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004195687061183551, + "loss": 3.2309, + "theoretical_loss": 3.868867826630293, + "tokens_seen": 558566400 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004195586760280843, + "loss": 3.0252, + "theoretical_loss": 3.868819867168431, + "tokens_seen": 558631936 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 913164, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0529143810272217, + "objective/train/theoretical_loss": 3.8687839022979373, + "objective/train/tokens_used": 579141088, + "theoretical_loss": 3.8687839022979373, + "tokens_seen": 558681088 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041954864593781344, + "loss": 3.0871, + "theoretical_loss": 3.868771914907785, + "tokens_seen": 558697472 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004195386158475427, + "loss": 2.9583, + "theoretical_loss": 3.868723969846429, + "tokens_seen": 558763008 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004195285857572718, + "loss": 3.1235, + "theoretical_loss": 3.868676031982438, + "tokens_seen": 558828544 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041951855566700104, + "loss": 3.1749, + "theoretical_loss": 3.868628101313888, + "tokens_seen": 558894080 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004195085255767302, + "loss": 3.019, + "theoretical_loss": 3.8685801778388553, + "tokens_seen": 558959616 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004194984954864594, + "loss": 3.0291, + "theoretical_loss": 3.8685322615554165, + "tokens_seen": 559025152 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004194884653961886, + "loss": 3.0996, + "theoretical_loss": 3.8684843524616506, + "tokens_seen": 559090688 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041947843530591776, + "loss": 2.908, + "theoretical_loss": 3.868436450555635, + "tokens_seen": 559156224 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041946840521564694, + "loss": 2.9947, + "theoretical_loss": 3.86838855583545, + "tokens_seen": 559221760 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004194583751253762, + "loss": 3.0028, + "theoretical_loss": 3.8683406682991754, + "tokens_seen": 559287296 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004194483450351053, + "loss": 3.0572, + "theoretical_loss": 3.868292787944892, + "tokens_seen": 559352832 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041943831494483454, + "loss": 2.8831, + "theoretical_loss": 3.8682449147706817, + "tokens_seen": 559418368 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041942828485456367, + "loss": 3.1536, + "theoretical_loss": 3.8681970487746264, + "tokens_seen": 559483904 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004194182547642929, + "loss": 3.206, + "theoretical_loss": 3.8681491899548095, + "tokens_seen": 559549440 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004194082246740221, + "loss": 3.0414, + "theoretical_loss": 3.868101338309314, + "tokens_seen": 559614976 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041939819458375126, + "loss": 3.1596, + "theoretical_loss": 3.868053493836226, + "tokens_seen": 559680512 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041938816449348045, + "loss": 2.9754, + "theoretical_loss": 3.8680056565336294, + "tokens_seen": 559746048 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004193781344032097, + "loss": 3.0925, + "theoretical_loss": 3.8679578263996106, + "tokens_seen": 559811584 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004193681043129388, + "loss": 2.9218, + "theoretical_loss": 3.8679100034322564, + "tokens_seen": 559877120 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041935807422266804, + "loss": 3.1247, + "theoretical_loss": 3.8678621876296546, + "tokens_seen": 559942656 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041934804413239717, + "loss": 3.1195, + "theoretical_loss": 3.867814378989893, + "tokens_seen": 560008192 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004193380140421264, + "loss": 2.9905, + "theoretical_loss": 3.8677665775110603, + "tokens_seen": 560073728 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004193279839518556, + "loss": 3.134, + "theoretical_loss": 3.867718783191247, + "tokens_seen": 560139264 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041931795386158477, + "loss": 3.111, + "theoretical_loss": 3.8676709960285427, + "tokens_seen": 560204800 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041930792377131395, + "loss": 3.086, + "theoretical_loss": 3.8676232160210384, + "tokens_seen": 560270336 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 915957, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.470583200454712, + "objective/train/theoretical_loss": 3.867587385709863, + "objective/train/tokens_used": 580779488, + "theoretical_loss": 3.867587385709863, + "tokens_seen": 560319488 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041929789368104313, + "loss": 3.2533, + "theoretical_loss": 3.867575443166827, + "tokens_seen": 560335872 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004192878635907723, + "loss": 2.9998, + "theoretical_loss": 3.8675276774640004, + "tokens_seen": 560401408 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041927783350050155, + "loss": 3.0662, + "theoretical_loss": 3.867479918910653, + "tokens_seen": 560466944 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041926780341023067, + "loss": 3.0912, + "theoretical_loss": 3.8674321675048766, + "tokens_seen": 560532480 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004192577733199599, + "loss": 2.949, + "theoretical_loss": 3.867384423244768, + "tokens_seen": 560598016 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041924774322968904, + "loss": 3.2618, + "theoretical_loss": 3.8673366861284224, + "tokens_seen": 560663552 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041923771313941827, + "loss": 3.0587, + "theoretical_loss": 3.8672889561539354, + "tokens_seen": 560729088 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041922768304914745, + "loss": 3.0442, + "theoretical_loss": 3.867241233319404, + "tokens_seen": 560794624 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041921765295887663, + "loss": 3.1142, + "theoretical_loss": 3.867193517622927, + "tokens_seen": 560860160 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004192076228686058, + "loss": 3.093, + "theoretical_loss": 3.8671458090626016, + "tokens_seen": 560925696 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041919759277833505, + "loss": 2.9922, + "theoretical_loss": 3.867098107636528, + "tokens_seen": 560991232 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004191875626880642, + "loss": 2.9877, + "theoretical_loss": 3.867050413342805, + "tokens_seen": 561056768 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004191775325977934, + "loss": 3.2314, + "theoretical_loss": 3.8670027261795346, + "tokens_seen": 561122304 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041916750250752254, + "loss": 3.2452, + "theoretical_loss": 3.8669550461448168, + "tokens_seen": 561187840 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004191574724172518, + "loss": 2.9925, + "theoretical_loss": 3.8669073732367547, + "tokens_seen": 561253376 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041914744232698095, + "loss": 3.147, + "theoretical_loss": 3.8668597074534508, + "tokens_seen": 561318912 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041913741223671014, + "loss": 3.1627, + "theoretical_loss": 3.866812048793008, + "tokens_seen": 561384448 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004191273821464393, + "loss": 3.1278, + "theoretical_loss": 3.8667643972535313, + "tokens_seen": 561449984 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004191173520561685, + "loss": 3.2179, + "theoretical_loss": 3.866716752833126, + "tokens_seen": 561515520 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004191073219658977, + "loss": 2.9501, + "theoretical_loss": 3.866669115529897, + "tokens_seen": 561581056 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004190972918756269, + "loss": 3.0843, + "theoretical_loss": 3.866621485341952, + "tokens_seen": 561646592 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041908726178535604, + "loss": 3.1805, + "theoretical_loss": 3.866573862267396, + "tokens_seen": 561712128 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004190772316950853, + "loss": 2.9786, + "theoretical_loss": 3.8665262463043386, + "tokens_seen": 561777664 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004190672016048144, + "loss": 3.0108, + "theoretical_loss": 3.866478637450888, + "tokens_seen": 561843200 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041905717151454364, + "loss": 2.9879, + "theoretical_loss": 3.866431035705154, + "tokens_seen": 561908736 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 918788, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.934800386428833, + "objective/train/theoretical_loss": 3.8663953390591543, + "objective/train/tokens_used": 582417888, + "theoretical_loss": 3.8663953390591543, + "tokens_seen": 561957888 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004190471414242728, + "loss": 3.074, + "theoretical_loss": 3.8663834410652456, + "tokens_seen": 561974272 + }, + { + "epoch": 1.07, + "learning_rate": 0.000419037111334002, + "loss": 2.9406, + "theoretical_loss": 3.8663358535292742, + "tokens_seen": 562039808 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004190270812437312, + "loss": 3.0693, + "theoretical_loss": 3.8662882730953516, + "tokens_seen": 562105344 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004190170511534604, + "loss": 2.972, + "theoretical_loss": 3.8662406997615895, + "tokens_seen": 562170880 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041900702106318954, + "loss": 3.1168, + "theoretical_loss": 3.8661931335261013, + "tokens_seen": 562236416 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004189969909729188, + "loss": 3.1351, + "theoretical_loss": 3.866145574387, + "tokens_seen": 562301952 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004189869608826479, + "loss": 3.0108, + "theoretical_loss": 3.866098022342401, + "tokens_seen": 562367488 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041897693079237714, + "loss": 3.202, + "theoretical_loss": 3.8660504773904183, + "tokens_seen": 562433024 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004189669007021063, + "loss": 3.1045, + "theoretical_loss": 3.866002939529169, + "tokens_seen": 562498560 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004189568706118355, + "loss": 3.1197, + "theoretical_loss": 3.8659554087567685, + "tokens_seen": 562564096 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004189468405215647, + "loss": 2.9884, + "theoretical_loss": 3.8659078850713353, + "tokens_seen": 562629632 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041893681043129387, + "loss": 3.1069, + "theoretical_loss": 3.865860368470986, + "tokens_seen": 562695168 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041892678034102305, + "loss": 3.0204, + "theoretical_loss": 3.8658128589538396, + "tokens_seen": 562760704 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004189167502507523, + "loss": 3.0394, + "theoretical_loss": 3.8657653565180166, + "tokens_seen": 562826240 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041890672016048146, + "loss": 3.0474, + "theoretical_loss": 3.8657178611616363, + "tokens_seen": 562891776 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041889669007021065, + "loss": 3.033, + "theoretical_loss": 3.8656703728828194, + "tokens_seen": 562957312 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004188866599799399, + "loss": 3.1477, + "theoretical_loss": 3.8656228916796884, + "tokens_seen": 563022848 + }, + { + "epoch": 1.07, + "learning_rate": 0.000418876629889669, + "loss": 3.1831, + "theoretical_loss": 3.8655754175503647, + "tokens_seen": 563088384 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041886659979939824, + "loss": 3.109, + "theoretical_loss": 3.8655279504929716, + "tokens_seen": 563153920 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041885656970912737, + "loss": 3.0026, + "theoretical_loss": 3.8654804905056332, + "tokens_seen": 563219456 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004188465396188566, + "loss": 3.0392, + "theoretical_loss": 3.865433037586473, + "tokens_seen": 563284992 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004188365095285858, + "loss": 2.9039, + "theoretical_loss": 3.8653855917336175, + "tokens_seen": 563350528 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041882647943831497, + "loss": 3.014, + "theoretical_loss": 3.865338152945191, + "tokens_seen": 563416064 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041881644934804415, + "loss": 3.1656, + "theoretical_loss": 3.865290721219322, + "tokens_seen": 563481600 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041880641925777333, + "loss": 3.0503, + "theoretical_loss": 3.8652432965541363, + "tokens_seen": 563547136 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 921621, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3773038387298584, + "objective/train/theoretical_loss": 3.865207732687695, + "objective/train/tokens_used": 584056288, + "theoretical_loss": 3.865207732687695, + "tokens_seen": 563596288 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004187963891675025, + "loss": 3.2218, + "theoretical_loss": 3.8651958789477625, + "tokens_seen": 563612672 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041878635907723175, + "loss": 3.0694, + "theoretical_loss": 3.8651484683983295, + "tokens_seen": 563678208 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004187763289869609, + "loss": 3.1783, + "theoretical_loss": 3.8651010649039663, + "tokens_seen": 563743744 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004187662988966901, + "loss": 3.0644, + "theoretical_loss": 3.8650536684628034, + "tokens_seen": 563809280 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041875626880641924, + "loss": 2.9545, + "theoretical_loss": 3.8650062790729716, + "tokens_seen": 563874816 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041874623871614847, + "loss": 3.0066, + "theoretical_loss": 3.8649588967326025, + "tokens_seen": 563940352 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041873620862587765, + "loss": 3.0785, + "theoretical_loss": 3.8649115214398284, + "tokens_seen": 564005888 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041872617853560683, + "loss": 3.0692, + "theoretical_loss": 3.864864153192782, + "tokens_seen": 564071424 + }, + { + "epoch": 1.07, + "learning_rate": 0.000418716148445336, + "loss": 2.8544, + "theoretical_loss": 3.8648167919895977, + "tokens_seen": 564136960 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041870611835506525, + "loss": 2.9994, + "theoretical_loss": 3.86476943782841, + "tokens_seen": 564202496 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004186960882647944, + "loss": 2.9848, + "theoretical_loss": 3.864722090707353, + "tokens_seen": 564268032 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004186860581745236, + "loss": 3.0664, + "theoretical_loss": 3.8646747506245633, + "tokens_seen": 564333568 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041867602808425274, + "loss": 3.1127, + "theoretical_loss": 3.8646274175781774, + "tokens_seen": 564399104 + }, + { + "epoch": 1.07, + "learning_rate": 0.000418665997993982, + "loss": 3.0956, + "theoretical_loss": 3.8645800915663324, + "tokens_seen": 564464640 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041865596790371115, + "loss": 3.0535, + "theoretical_loss": 3.864532772587167, + "tokens_seen": 564530176 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041864593781344034, + "loss": 3.1583, + "theoretical_loss": 3.864485460638819, + "tokens_seen": 564595712 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004186359077231695, + "loss": 3.1568, + "theoretical_loss": 3.8644381557194283, + "tokens_seen": 564661248 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004186258776328987, + "loss": 3.0656, + "theoretical_loss": 3.864390857827135, + "tokens_seen": 564726784 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004186158475426279, + "loss": 3.0482, + "theoretical_loss": 3.864343566960079, + "tokens_seen": 564792320 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004186058174523571, + "loss": 2.8451, + "theoretical_loss": 3.8642962831164036, + "tokens_seen": 564857856 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041859578736208624, + "loss": 2.9826, + "theoretical_loss": 3.8642490062942496, + "tokens_seen": 564923392 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004185857572718155, + "loss": 3.1552, + "theoretical_loss": 3.8642017364917605, + "tokens_seen": 564988928 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004185757271815446, + "loss": 2.986, + "theoretical_loss": 3.86415447370708, + "tokens_seen": 565054464 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041856569709127384, + "loss": 2.9196, + "theoretical_loss": 3.8641072179383524, + "tokens_seen": 565120000 + }, + { + "epoch": 1.07, + "learning_rate": 0.000418555667001003, + "loss": 3.1075, + "theoretical_loss": 3.8640599691837227, + "tokens_seen": 565185536 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 922982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9851155281066895, + "objective/train/theoretical_loss": 3.8640245372196365, + "objective/train/tokens_used": 585694688, + "theoretical_loss": 3.8640245372196365, + "tokens_seen": 565234688 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004185456369107322, + "loss": 3.0251, + "theoretical_loss": 3.8640127274413363, + "tokens_seen": 565251072 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004185356068204614, + "loss": 3.0798, + "theoretical_loss": 3.8639654927093403, + "tokens_seen": 565316608 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004185255767301906, + "loss": 2.8526, + "theoretical_loss": 3.8639182649858816, + "tokens_seen": 565382144 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041851554663991974, + "loss": 3.116, + "theoretical_loss": 3.863871044269108, + "tokens_seen": 565447680 + }, + { + "epoch": 1.07, + "learning_rate": 0.000418505516549649, + "loss": 3.0062, + "theoretical_loss": 3.863823830557169, + "tokens_seen": 565513216 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004184954864593781, + "loss": 3.0891, + "theoretical_loss": 3.8637766238482123, + "tokens_seen": 565578752 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041848545636910734, + "loss": 2.9966, + "theoretical_loss": 3.863729424140389, + "tokens_seen": 565644288 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004184754262788365, + "loss": 3.1879, + "theoretical_loss": 3.863682231431849, + "tokens_seen": 565709824 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004184653961885657, + "loss": 3.0122, + "theoretical_loss": 3.863635045720745, + "tokens_seen": 565775360 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004184553660982949, + "loss": 3.0104, + "theoretical_loss": 3.863587867005228, + "tokens_seen": 565840896 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041844533600802407, + "loss": 3.0536, + "theoretical_loss": 3.863540695283451, + "tokens_seen": 565906432 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041843530591775325, + "loss": 3.0618, + "theoretical_loss": 3.8634935305535674, + "tokens_seen": 565971968 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004184252758274825, + "loss": 2.926, + "theoretical_loss": 3.8634463728137316, + "tokens_seen": 566037504 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004184152457372116, + "loss": 3.0283, + "theoretical_loss": 3.863399222062099, + "tokens_seen": 566103040 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041840521564694085, + "loss": 3.1723, + "theoretical_loss": 3.863352078296825, + "tokens_seen": 566168576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041839518555666997, + "loss": 3.0397, + "theoretical_loss": 3.863304941516065, + "tokens_seen": 566234112 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004183851554663992, + "loss": 3.0588, + "theoretical_loss": 3.863257811717977, + "tokens_seen": 566299648 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004183751253761284, + "loss": 3.0036, + "theoretical_loss": 3.8632106889007183, + "tokens_seen": 566365184 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041836509528585757, + "loss": 2.9142, + "theoretical_loss": 3.8631635730624474, + "tokens_seen": 566430720 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041835506519558675, + "loss": 3.1564, + "theoretical_loss": 3.8631164642013234, + "tokens_seen": 566496256 + }, + { + "epoch": 1.07, + "learning_rate": 0.000418345035105316, + "loss": 3.1191, + "theoretical_loss": 3.863069362315506, + "tokens_seen": 566561792 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004183350050150451, + "loss": 2.8815, + "theoretical_loss": 3.863022267403156, + "tokens_seen": 566627328 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041832497492477435, + "loss": 2.985, + "theoretical_loss": 3.8629751794624343, + "tokens_seen": 566692864 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004183149448345035, + "loss": 3.133, + "theoretical_loss": 3.862928098491503, + "tokens_seen": 566758400 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004183049147442327, + "loss": 3.003, + "theoretical_loss": 3.8628810244885248, + "tokens_seen": 566823936 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 925571, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.269139051437378, + "objective/train/theoretical_loss": 3.8628457235579052, + "objective/train/tokens_used": 587333088, + "theoretical_loss": 3.8628457235579052, + "tokens_seen": 566873088 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004182948846539619, + "loss": 3.0759, + "theoretical_loss": 3.862833957451662, + "tokens_seen": 566889472 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004182848545636911, + "loss": 3.0797, + "theoretical_loss": 3.86278689737908, + "tokens_seen": 566955008 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041827482447342025, + "loss": 3.0723, + "theoretical_loss": 3.862739844268943, + "tokens_seen": 567020544 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041826479438314944, + "loss": 3.1146, + "theoretical_loss": 3.8626927981194163, + "tokens_seen": 567086080 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004182547642928786, + "loss": 3.1014, + "theoretical_loss": 3.862645758928666, + "tokens_seen": 567151616 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041824473420260785, + "loss": 3.0208, + "theoretical_loss": 3.8625987266948583, + "tokens_seen": 567217152 + }, + { + "epoch": 1.07, + "learning_rate": 0.000418234704112337, + "loss": 3.138, + "theoretical_loss": 3.8625517014161614, + "tokens_seen": 567282688 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004182246740220662, + "loss": 2.968, + "theoretical_loss": 3.8625046830907435, + "tokens_seen": 567348224 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004182146439317954, + "loss": 3.0609, + "theoretical_loss": 3.862457671716773, + "tokens_seen": 567413760 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004182046138415246, + "loss": 2.8549, + "theoretical_loss": 3.862410667292419, + "tokens_seen": 567479296 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041819458375125376, + "loss": 3.0389, + "theoretical_loss": 3.8623636698158537, + "tokens_seen": 567544832 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041818455366098294, + "loss": 2.8975, + "theoretical_loss": 3.8623166792852457, + "tokens_seen": 567610368 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004181745235707121, + "loss": 3.0791, + "theoretical_loss": 3.8622696956987683, + "tokens_seen": 567675904 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041816449348044136, + "loss": 3.1002, + "theoretical_loss": 3.862222719054593, + "tokens_seen": 567741440 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041815446339017054, + "loss": 3.1328, + "theoretical_loss": 3.862175749350893, + "tokens_seen": 567806976 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004181444332998997, + "loss": 3.1679, + "theoretical_loss": 3.862128786585841, + "tokens_seen": 567872512 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004181344032096289, + "loss": 3.1138, + "theoretical_loss": 3.8620818307576137, + "tokens_seen": 567938048 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004181243731193581, + "loss": 3.0429, + "theoretical_loss": 3.8620348818643846, + "tokens_seen": 568003584 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004181143430290873, + "loss": 3.1078, + "theoretical_loss": 3.8619879399043295, + "tokens_seen": 568069120 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041810431293881644, + "loss": 2.8976, + "theoretical_loss": 3.861941004875625, + "tokens_seen": 568134656 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004180942828485457, + "loss": 3.0287, + "theoretical_loss": 3.8618940767764487, + "tokens_seen": 568200192 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004180842527582748, + "loss": 3.0647, + "theoretical_loss": 3.861847155604978, + "tokens_seen": 568265728 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041807422266800404, + "loss": 3.1022, + "theoretical_loss": 3.8618002413593917, + "tokens_seen": 568331264 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004180641925777332, + "loss": 2.7438, + "theoretical_loss": 3.8617533340378687, + "tokens_seen": 568396800 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004180541624874624, + "loss": 2.9471, + "theoretical_loss": 3.861706433638589, + "tokens_seen": 568462336 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 928352, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0806820392608643, + "objective/train/theoretical_loss": 3.8616712628807575, + "objective/train/tokens_used": 588971488, + "theoretical_loss": 3.8616712628807575, + "tokens_seen": 568511488 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004180441323971916, + "loss": 3.0077, + "theoretical_loss": 3.8616595401597333, + "tokens_seen": 568527872 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004180341023069208, + "loss": 2.951, + "theoretical_loss": 3.861612653599483, + "tokens_seen": 568593408 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041802407221664994, + "loss": 3.0076, + "theoretical_loss": 3.8615657739560203, + "tokens_seen": 568658944 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004180140421263792, + "loss": 3.01, + "theoretical_loss": 3.861518901227527, + "tokens_seen": 568724480 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004180040120361083, + "loss": 3.0089, + "theoretical_loss": 3.861472035412187, + "tokens_seen": 568790016 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041799398194583754, + "loss": 2.9847, + "theoretical_loss": 3.8614251765081846, + "tokens_seen": 568855552 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004179839518555667, + "loss": 3.0717, + "theoretical_loss": 3.861378324513704, + "tokens_seen": 568921088 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004179739217652959, + "loss": 2.9127, + "theoretical_loss": 3.861331479426931, + "tokens_seen": 568986624 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004179638916750251, + "loss": 3.002, + "theoretical_loss": 3.8612846412460513, + "tokens_seen": 569052160 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041795386158475427, + "loss": 3.2024, + "theoretical_loss": 3.8612378099692526, + "tokens_seen": 569117696 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041794383149448345, + "loss": 3.1025, + "theoretical_loss": 3.861190985594721, + "tokens_seen": 569183232 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004179338014042127, + "loss": 2.9636, + "theoretical_loss": 3.8611441681206453, + "tokens_seen": 569248768 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004179237713139418, + "loss": 3.0196, + "theoretical_loss": 3.8610973575452148, + "tokens_seen": 569314304 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041791374122367105, + "loss": 3.3067, + "theoretical_loss": 3.861050553866618, + "tokens_seen": 569379840 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041790371113340017, + "loss": 2.8484, + "theoretical_loss": 3.861003757083046, + "tokens_seen": 569445376 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004178936810431294, + "loss": 2.9734, + "theoretical_loss": 3.8609569671926898, + "tokens_seen": 569510912 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004178836509528586, + "loss": 3.0344, + "theoretical_loss": 3.86091018419374, + "tokens_seen": 569576448 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041787362086258777, + "loss": 3.1263, + "theoretical_loss": 3.8608634080843895, + "tokens_seen": 569641984 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041786359077231695, + "loss": 3.0503, + "theoretical_loss": 3.8608166388628313, + "tokens_seen": 569707520 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004178535606820462, + "loss": 3.1596, + "theoretical_loss": 3.8607698765272582, + "tokens_seen": 569773056 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004178435305917753, + "loss": 3.0713, + "theoretical_loss": 3.860723121075866, + "tokens_seen": 569838592 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041783350050150455, + "loss": 2.9113, + "theoretical_loss": 3.8606763725068483, + "tokens_seen": 569904128 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004178234704112337, + "loss": 3.1447, + "theoretical_loss": 3.8606296308184014, + "tokens_seen": 569969664 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004178134403209629, + "loss": 3.0639, + "theoretical_loss": 3.860582896008721, + "tokens_seen": 570035200 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004178034102306921, + "loss": 3.1707, + "theoretical_loss": 3.8605361680760053, + "tokens_seen": 570100736 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 931197, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8587732315063477, + "objective/train/theoretical_loss": 3.8605011266383915, + "objective/train/tokens_used": 590609888, + "theoretical_loss": 3.8605011266383915, + "tokens_seen": 570149888 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004177933801404213, + "loss": 2.9355, + "theoretical_loss": 3.860489447018451, + "tokens_seen": 570166272 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041778335005015045, + "loss": 3.0494, + "theoretical_loss": 3.8604427328342563, + "tokens_seen": 570231808 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041777331995987964, + "loss": 2.9303, + "theoretical_loss": 3.8603960255216214, + "tokens_seen": 570297344 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004177632898696088, + "loss": 3.0652, + "theoretical_loss": 3.860349325078745, + "tokens_seen": 570362880 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041775325977933805, + "loss": 3.0686, + "theoretical_loss": 3.8603026315038282, + "tokens_seen": 570428416 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004177432296890672, + "loss": 2.9216, + "theoretical_loss": 3.8602559447950715, + "tokens_seen": 570493952 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004177331995987964, + "loss": 3.2599, + "theoretical_loss": 3.860209264950677, + "tokens_seen": 570559488 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004177231695085256, + "loss": 3.0526, + "theoretical_loss": 3.860162591968847, + "tokens_seen": 570625024 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004177131394182548, + "loss": 3.1097, + "theoretical_loss": 3.8601159258477846, + "tokens_seen": 570690560 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041770310932798396, + "loss": 3.0838, + "theoretical_loss": 3.860069266585694, + "tokens_seen": 570756096 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041769307923771314, + "loss": 3.1821, + "theoretical_loss": 3.86002261418078, + "tokens_seen": 570821632 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004176830491474423, + "loss": 3.071, + "theoretical_loss": 3.859975968631246, + "tokens_seen": 570887168 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041767301905717156, + "loss": 3.1288, + "theoretical_loss": 3.8599293299352992, + "tokens_seen": 570952704 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004176629889669007, + "loss": 3.0315, + "theoretical_loss": 3.859882698091146, + "tokens_seen": 571018240 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004176529588766299, + "loss": 2.9203, + "theoretical_loss": 3.859836073096994, + "tokens_seen": 571083776 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041764292878635904, + "loss": 3.1349, + "theoretical_loss": 3.8597894549510494, + "tokens_seen": 571149312 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004176328986960883, + "loss": 3.1904, + "theoretical_loss": 3.8597428436515226, + "tokens_seen": 571214848 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041762286860581746, + "loss": 3.1754, + "theoretical_loss": 3.859696239196621, + "tokens_seen": 571280384 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041761283851554664, + "loss": 3.0469, + "theoretical_loss": 3.8596496415845567, + "tokens_seen": 571345920 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004176028084252758, + "loss": 3.1447, + "theoretical_loss": 3.859603050813539, + "tokens_seen": 571411456 + }, + { + "epoch": 1.07, + "learning_rate": 0.000417592778335005, + "loss": 3.0836, + "theoretical_loss": 3.8595564668817786, + "tokens_seen": 571476992 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004175827482447342, + "loss": 3.1563, + "theoretical_loss": 3.8595098897874878, + "tokens_seen": 571542528 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004175727181544634, + "loss": 3.0928, + "theoretical_loss": 3.8594633195288797, + "tokens_seen": 571608064 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041756268806419255, + "loss": 3.0357, + "theoretical_loss": 3.8594167561041672, + "tokens_seen": 571673600 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004175526579739218, + "loss": 3.1297, + "theoretical_loss": 3.859370199511564, + "tokens_seen": 571739136 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 933924, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0719001293182373, + "objective/train/theoretical_loss": 3.8593352865496096, + "objective/train/tokens_used": 592248288, + "theoretical_loss": 3.8593352865496096, + "tokens_seen": 571788288 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041754262788365096, + "loss": 3.035, + "theoretical_loss": 3.8593236497492853, + "tokens_seen": 571804672 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041753259779338015, + "loss": 3.0114, + "theoretical_loss": 3.8592771068155454, + "tokens_seen": 571870208 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004175225677031093, + "loss": 2.943, + "theoretical_loss": 3.8592305707085606, + "tokens_seen": 571935744 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004175125376128385, + "loss": 3.0694, + "theoretical_loss": 3.859184041426548, + "tokens_seen": 572001280 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004175025075225677, + "loss": 3.054, + "theoretical_loss": 3.8591375189677244, + "tokens_seen": 572066816 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004174924774322969, + "loss": 3.0067, + "theoretical_loss": 3.859091003330308, + "tokens_seen": 572132352 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041748244734202605, + "loss": 3.0333, + "theoretical_loss": 3.859044494512517, + "tokens_seen": 572197888 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004174724172517553, + "loss": 3.0836, + "theoretical_loss": 3.8589979925125704, + "tokens_seen": 572263424 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004174623871614844, + "loss": 2.7874, + "theoretical_loss": 3.8589514973286896, + "tokens_seen": 572328960 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041745235707121365, + "loss": 3.1844, + "theoretical_loss": 3.858905008959094, + "tokens_seen": 572394496 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041744232698094283, + "loss": 3.2001, + "theoretical_loss": 3.8588585274020044, + "tokens_seen": 572460032 + }, + { + "epoch": 1.07, + "learning_rate": 0.000417432296890672, + "loss": 3.1086, + "theoretical_loss": 3.8588120526556446, + "tokens_seen": 572525568 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004174222668004012, + "loss": 3.0442, + "theoretical_loss": 3.8587655847182356, + "tokens_seen": 572591104 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041741223671013037, + "loss": 2.9819, + "theoretical_loss": 3.858719123588001, + "tokens_seen": 572656640 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004174022066198596, + "loss": 3.0749, + "theoretical_loss": 3.858672669263165, + "tokens_seen": 572722176 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004173921765295888, + "loss": 3.1163, + "theoretical_loss": 3.858626221741952, + "tokens_seen": 572787712 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041738214643931797, + "loss": 2.962, + "theoretical_loss": 3.858579781022588, + "tokens_seen": 572853248 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041737211634904715, + "loss": 3.0971, + "theoretical_loss": 3.858533347103298, + "tokens_seen": 572918784 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004173620862587764, + "loss": 3.1464, + "theoretical_loss": 3.858486919982309, + "tokens_seen": 572984320 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004173520561685055, + "loss": 3.0606, + "theoretical_loss": 3.8584404996578483, + "tokens_seen": 573049856 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041734202607823475, + "loss": 2.8998, + "theoretical_loss": 3.858394086128144, + "tokens_seen": 573115392 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004173319959879639, + "loss": 2.9732, + "theoretical_loss": 3.858347679391424, + "tokens_seen": 573180928 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004173219658976931, + "loss": 2.947, + "theoretical_loss": 3.8583012794459184, + "tokens_seen": 573246464 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004173119358074223, + "loss": 3.0532, + "theoretical_loss": 3.858254886289857, + "tokens_seen": 573312000 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004173019057171515, + "loss": 3.0734, + "theoretical_loss": 3.85820849992147, + "tokens_seen": 573377536 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 936713, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1334517002105713, + "objective/train/theoretical_loss": 3.858173714598527, + "objective/train/tokens_used": 593886688, + "theoretical_loss": 3.858173714598527, + "tokens_seen": 573426688 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041729187562688065, + "loss": 3.0283, + "theoretical_loss": 3.8581621203389886, + "tokens_seen": 573443072 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041728184553660984, + "loss": 2.9634, + "theoretical_loss": 3.858115747540645, + "tokens_seen": 573508608 + }, + { + "epoch": 1.07, + "learning_rate": 0.000417271815446339, + "loss": 3.0946, + "theoretical_loss": 3.8580693815246727, + "tokens_seen": 573574144 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041726178535606825, + "loss": 3.0135, + "theoretical_loss": 3.8580230222893035, + "tokens_seen": 573639680 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004172517552657974, + "loss": 3.0206, + "theoretical_loss": 3.8579766698327718, + "tokens_seen": 573705216 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004172417251755266, + "loss": 3.0584, + "theoretical_loss": 3.8579303241533123, + "tokens_seen": 573770752 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004172316950852558, + "loss": 3.1552, + "theoretical_loss": 3.8578839852491598, + "tokens_seen": 573836288 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004172316950852558, + "loss": 3.114, + "theoretical_loss": 3.857837653118551, + "tokens_seen": 573901824 + }, + { + "epoch": 1.07, + "learning_rate": 0.000417221664994985, + "loss": 3.2095, + "theoretical_loss": 3.857791327759722, + "tokens_seen": 573967360 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041721163490471416, + "loss": 3.0703, + "theoretical_loss": 3.85774500917091, + "tokens_seen": 574032896 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041720160481444334, + "loss": 3.1253, + "theoretical_loss": 3.8576986973503526, + "tokens_seen": 574098432 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004171915747241725, + "loss": 3.1997, + "theoretical_loss": 3.857652392296289, + "tokens_seen": 574163968 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041718154463390176, + "loss": 3.1192, + "theoretical_loss": 3.8576060940069583, + "tokens_seen": 574229504 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004171715145436309, + "loss": 3.0166, + "theoretical_loss": 3.8575598024805995, + "tokens_seen": 574295040 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004171614844533601, + "loss": 3.2216, + "theoretical_loss": 3.857513517715454, + "tokens_seen": 574360576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041715145436308924, + "loss": 3.0882, + "theoretical_loss": 3.857467239709763, + "tokens_seen": 574426112 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004171414242728185, + "loss": 3.0467, + "theoretical_loss": 3.8574209684617675, + "tokens_seen": 574491648 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041713139418254766, + "loss": 3.1618, + "theoretical_loss": 3.857374703969711, + "tokens_seen": 574557184 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041712136409227684, + "loss": 3.1, + "theoretical_loss": 3.857328446231836, + "tokens_seen": 574622720 + }, + { + "epoch": 1.07, + "learning_rate": 0.000417111334002006, + "loss": 3.1712, + "theoretical_loss": 3.857282195246386, + "tokens_seen": 574688256 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004171013039117352, + "loss": 2.999, + "theoretical_loss": 3.8572359510116065, + "tokens_seen": 574753792 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004170912738214644, + "loss": 2.9568, + "theoretical_loss": 3.8571897135257416, + "tokens_seen": 574819328 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004170812437311936, + "loss": 3.0057, + "theoretical_loss": 3.857143482787038, + "tokens_seen": 574884864 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041707121364092275, + "loss": 3.0594, + "theoretical_loss": 3.8570972587937415, + "tokens_seen": 574950400 + }, + { + "epoch": 1.07, + "learning_rate": 0.000417061183550652, + "loss": 3.2226, + "theoretical_loss": 3.857051041544099, + "tokens_seen": 575015936 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 939080, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0695667266845703, + "objective/train/theoretical_loss": 3.8570163830313366, + "objective/train/tokens_used": 595525088, + "theoretical_loss": 3.8570163830313366, + "tokens_seen": 575065088 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041705115346038116, + "loss": 3.1113, + "theoretical_loss": 3.8570048310363596, + "tokens_seen": 575081472 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041704112337011035, + "loss": 2.9712, + "theoretical_loss": 3.8569586272687695, + "tokens_seen": 575147008 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004170310932798395, + "loss": 3.0215, + "theoretical_loss": 3.8569124302395794, + "tokens_seen": 575212544 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004170210631895687, + "loss": 3.2315, + "theoretical_loss": 3.8568662399470384, + "tokens_seen": 575278080 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004170110330992979, + "loss": 3.1272, + "theoretical_loss": 3.856820056389397, + "tokens_seen": 575343616 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004170010030090271, + "loss": 3.0746, + "theoretical_loss": 3.8567738795649062, + "tokens_seen": 575409152 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041699097291875625, + "loss": 3.2432, + "theoretical_loss": 3.8567277094718175, + "tokens_seen": 575474688 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004169809428284855, + "loss": 3.0044, + "theoretical_loss": 3.8566815461083843, + "tokens_seen": 575540224 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004169709127382146, + "loss": 3.1146, + "theoretical_loss": 3.8566353894728578, + "tokens_seen": 575605760 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041696088264794385, + "loss": 3.0856, + "theoretical_loss": 3.856589239563492, + "tokens_seen": 575671296 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041695085255767303, + "loss": 3.0063, + "theoretical_loss": 3.8565430963785428, + "tokens_seen": 575736832 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004169408224674022, + "loss": 3.0212, + "theoretical_loss": 3.8564969599162633, + "tokens_seen": 575802368 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004169307923771314, + "loss": 3.1169, + "theoretical_loss": 3.85645083017491, + "tokens_seen": 575867904 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004169207622868606, + "loss": 2.9288, + "theoretical_loss": 3.8564047071527385, + "tokens_seen": 575933440 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041691073219658975, + "loss": 3.0753, + "theoretical_loss": 3.8563585908480063, + "tokens_seen": 575998976 + }, + { + "epoch": 1.07, + "learning_rate": 0.000416900702106319, + "loss": 3.2409, + "theoretical_loss": 3.8563124812589704, + "tokens_seen": 576064512 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004168906720160481, + "loss": 3.027, + "theoretical_loss": 3.85626637838389, + "tokens_seen": 576130048 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041688064192577735, + "loss": 3.1043, + "theoretical_loss": 3.856220282221023, + "tokens_seen": 576195584 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041687061183550653, + "loss": 3.0514, + "theoretical_loss": 3.856174192768629, + "tokens_seen": 576261120 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004168605817452357, + "loss": 3.059, + "theoretical_loss": 3.8561281100249674, + "tokens_seen": 576326656 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004168505516549649, + "loss": 3.1048, + "theoretical_loss": 3.856082033988301, + "tokens_seen": 576392192 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004168405215646941, + "loss": 3.0987, + "theoretical_loss": 3.856035964656889, + "tokens_seen": 576457728 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041683049147442326, + "loss": 3.2728, + "theoretical_loss": 3.8559899020289947, + "tokens_seen": 576523264 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004168204613841525, + "loss": 3.1595, + "theoretical_loss": 3.855943846102881, + "tokens_seen": 576588800 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004168104312938816, + "loss": 3.0373, + "theoretical_loss": 3.8558977968768104, + "tokens_seen": 576654336 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 941938, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.333979368209839, + "objective/train/theoretical_loss": 3.8558632643531174, + "objective/train/tokens_used": 597163488, + "theoretical_loss": 3.8558632643531174, + "tokens_seen": 576703488 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041680040120361085, + "loss": 3.235, + "theoretical_loss": 3.8558517543490485, + "tokens_seen": 576719872 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041679037111334, + "loss": 3.1537, + "theoretical_loss": 3.855805718517858, + "tokens_seen": 576785408 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004167803410230692, + "loss": 2.9451, + "theoretical_loss": 3.8557596893815047, + "tokens_seen": 576850944 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004167703109327984, + "loss": 3.1416, + "theoretical_loss": 3.8557136669382555, + "tokens_seen": 576916480 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004167602808425276, + "loss": 3.0266, + "theoretical_loss": 3.8556676511863763, + "tokens_seen": 576982016 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041675025075225676, + "loss": 2.8215, + "theoretical_loss": 3.8556216421241345, + "tokens_seen": 577047552 + }, + { + "epoch": 1.07, + "learning_rate": 0.000416740220661986, + "loss": 2.9044, + "theoretical_loss": 3.855575639749798, + "tokens_seen": 577113088 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004167301905717151, + "loss": 3.1797, + "theoretical_loss": 3.8555296440616353, + "tokens_seen": 577178624 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041672016048144436, + "loss": 2.9405, + "theoretical_loss": 3.855483655057916, + "tokens_seen": 577244160 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004167101303911735, + "loss": 2.9341, + "theoretical_loss": 3.855437672736909, + "tokens_seen": 577309696 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004167001003009027, + "loss": 3.18, + "theoretical_loss": 3.8553916970968856, + "tokens_seen": 577375232 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004166900702106319, + "loss": 2.9301, + "theoretical_loss": 3.8553457281361165, + "tokens_seen": 577440768 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004166800401203611, + "loss": 2.8787, + "theoretical_loss": 3.8552997658528736, + "tokens_seen": 577506304 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041667001003009026, + "loss": 3.1502, + "theoretical_loss": 3.8552538102454292, + "tokens_seen": 577571840 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041665997993981944, + "loss": 2.9555, + "theoretical_loss": 3.8552078613120564, + "tokens_seen": 577637376 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004166499498495487, + "loss": 3.1006, + "theoretical_loss": 3.855161919051029, + "tokens_seen": 577702912 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041663991975927786, + "loss": 2.968, + "theoretical_loss": 3.8551159834606215, + "tokens_seen": 577768448 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041662988966900704, + "loss": 3.0258, + "theoretical_loss": 3.855070054539109, + "tokens_seen": 577833984 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004166198595787362, + "loss": 3.1703, + "theoretical_loss": 3.8550241322847656, + "tokens_seen": 577899520 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004166098294884654, + "loss": 3.0554, + "theoretical_loss": 3.8549782166958693, + "tokens_seen": 577965056 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004165997993981946, + "loss": 2.885, + "theoretical_loss": 3.8549323077706967, + "tokens_seen": 578030592 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004165897693079238, + "loss": 3.1233, + "theoretical_loss": 3.854886405507525, + "tokens_seen": 578096128 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041657973921765295, + "loss": 3.1701, + "theoretical_loss": 3.854840509904632, + "tokens_seen": 578161664 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004165697091273822, + "loss": 3.0399, + "theoretical_loss": 3.8547946209602975, + "tokens_seen": 578227200 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041655967903711136, + "loss": 3.2498, + "theoretical_loss": 3.8547487386728, + "tokens_seen": 578292736 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 943112, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3646366596221924, + "objective/train/theoretical_loss": 3.8547143313246917, + "objective/train/tokens_used": 598801888, + "theoretical_loss": 3.8547143313246917, + "tokens_seen": 578341888 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041654964894684055, + "loss": 3.0736, + "theoretical_loss": 3.85470286304042, + "tokens_seen": 578358272 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004165396188565697, + "loss": 3.1699, + "theoretical_loss": 3.8546569940614384, + "tokens_seen": 578423808 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004165295887662989, + "loss": 2.952, + "theoretical_loss": 3.8546111317341363, + "tokens_seen": 578489344 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004165195586760281, + "loss": 3.0938, + "theoretical_loss": 3.8545652760567957, + "tokens_seen": 578554880 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004165095285857573, + "loss": 3.0178, + "theoretical_loss": 3.8545194270276997, + "tokens_seen": 578620416 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041649949849548645, + "loss": 2.9269, + "theoretical_loss": 3.854473584645131, + "tokens_seen": 578685952 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004164894684052157, + "loss": 3.0899, + "theoretical_loss": 3.854427748907374, + "tokens_seen": 578751488 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004164794383149448, + "loss": 3.0104, + "theoretical_loss": 3.854381919812713, + "tokens_seen": 578817024 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041646940822467405, + "loss": 2.9623, + "theoretical_loss": 3.8543360973594334, + "tokens_seen": 578882560 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041645937813440323, + "loss": 2.9099, + "theoretical_loss": 3.854290281545821, + "tokens_seen": 578948096 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004164493480441324, + "loss": 2.9002, + "theoretical_loss": 3.854244472370162, + "tokens_seen": 579013632 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004164393179538616, + "loss": 3.0403, + "theoretical_loss": 3.854198669830744, + "tokens_seen": 579079168 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004164292878635908, + "loss": 3.0044, + "theoretical_loss": 3.854152873925854, + "tokens_seen": 579144704 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041641925777331995, + "loss": 3.0512, + "theoretical_loss": 3.854107084653781, + "tokens_seen": 579210240 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004164092276830492, + "loss": 3.058, + "theoretical_loss": 3.854061302012814, + "tokens_seen": 579275776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004163991975927783, + "loss": 2.8909, + "theoretical_loss": 3.854015526001242, + "tokens_seen": 579341312 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041638916750250755, + "loss": 3.191, + "theoretical_loss": 3.8539697566173565, + "tokens_seen": 579406848 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041637913741223673, + "loss": 3.0747, + "theoretical_loss": 3.8539239938594476, + "tokens_seen": 579472384 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004163691073219659, + "loss": 2.9186, + "theoretical_loss": 3.853878237725807, + "tokens_seen": 579537920 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004163590772316951, + "loss": 3.0905, + "theoretical_loss": 3.8538324882147266, + "tokens_seen": 579603456 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004163490471414243, + "loss": 3.0409, + "theoretical_loss": 3.8537867453244994, + "tokens_seen": 579668992 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041633901705115346, + "loss": 2.9862, + "theoretical_loss": 3.8537410090534197, + "tokens_seen": 579734528 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004163289869608827, + "loss": 3.0016, + "theoretical_loss": 3.8536952793997803, + "tokens_seen": 579800064 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004163189568706118, + "loss": 3.1094, + "theoretical_loss": 3.8536495563618764, + "tokens_seen": 579865600 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041630892678034105, + "loss": 3.092, + "theoretical_loss": 3.8536038399380033, + "tokens_seen": 579931136 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 945877, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3596229553222656, + "objective/train/theoretical_loss": 3.8535695569595316, + "objective/train/tokens_used": 600440288, + "theoretical_loss": 3.8535695569595316, + "tokens_seen": 579980288 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004162988966900702, + "loss": 3.2177, + "theoretical_loss": 3.853558130126457, + "tokens_seen": 579996672 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004162888665997994, + "loss": 3.0505, + "theoretical_loss": 3.853512426925535, + "tokens_seen": 580062208 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004162788365095286, + "loss": 3.0994, + "theoretical_loss": 3.8534667303335333, + "tokens_seen": 580127744 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004162688064192578, + "loss": 3.0763, + "theoretical_loss": 3.85342104034875, + "tokens_seen": 580193280 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041625877632898696, + "loss": 2.9265, + "theoretical_loss": 3.853375356969485, + "tokens_seen": 580258816 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004162487462387162, + "loss": 2.9595, + "theoretical_loss": 3.853329680194035, + "tokens_seen": 580324352 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004162387161484453, + "loss": 2.924, + "theoretical_loss": 3.8532840100207015, + "tokens_seen": 580389888 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041622868605817456, + "loss": 3.018, + "theoretical_loss": 3.8532383464477844, + "tokens_seen": 580455424 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004162186559679037, + "loss": 3.1721, + "theoretical_loss": 3.853192689473585, + "tokens_seen": 580520960 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004162086258776329, + "loss": 3.1826, + "theoretical_loss": 3.8531470390964047, + "tokens_seen": 580586496 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004161985957873621, + "loss": 3.0961, + "theoretical_loss": 3.853101395314546, + "tokens_seen": 580652032 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004161885656970913, + "loss": 3.0335, + "theoretical_loss": 3.8530557581263114, + "tokens_seen": 580717568 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041617853560682046, + "loss": 3.0227, + "theoretical_loss": 3.8530101275300046, + "tokens_seen": 580783104 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041616850551654964, + "loss": 3.1173, + "theoretical_loss": 3.85296450352393, + "tokens_seen": 580848640 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004161584754262788, + "loss": 3.1658, + "theoretical_loss": 3.8529188861063925, + "tokens_seen": 580914176 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041614844533600806, + "loss": 3.1911, + "theoretical_loss": 3.852873275275697, + "tokens_seen": 580979712 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004161384152457372, + "loss": 3.1113, + "theoretical_loss": 3.85282767103015, + "tokens_seen": 581045248 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004161283851554664, + "loss": 3.0021, + "theoretical_loss": 3.8527820733680587, + "tokens_seen": 581110784 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041611835506519555, + "loss": 3.2052, + "theoretical_loss": 3.852736482287729, + "tokens_seen": 581176320 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004161083249749248, + "loss": 3.1744, + "theoretical_loss": 3.852690897787469, + "tokens_seen": 581241856 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041609829488465397, + "loss": 3.0273, + "theoretical_loss": 3.852645319865589, + "tokens_seen": 581307392 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041608826479438315, + "loss": 2.9126, + "theoretical_loss": 3.8525997485203964, + "tokens_seen": 581372928 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041607823470411233, + "loss": 2.9251, + "theoretical_loss": 3.852554183750202, + "tokens_seen": 581438464 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041606820461384156, + "loss": 3.1419, + "theoretical_loss": 3.852508625553316, + "tokens_seen": 581504000 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004160581745235707, + "loss": 2.9517, + "theoretical_loss": 3.8524630739280488, + "tokens_seen": 581569536 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 948876, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9106557369232178, + "objective/train/theoretical_loss": 3.852428914520708, + "objective/train/tokens_used": 602078688, + "theoretical_loss": 3.852428914520708, + "tokens_seen": 581618688 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004160481444332999, + "loss": 3.0592, + "theoretical_loss": 3.8524175288727127, + "tokens_seen": 581635072 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041603811434302905, + "loss": 2.8805, + "theoretical_loss": 3.8523719903856204, + "tokens_seen": 581700608 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004160280842527583, + "loss": 3.1226, + "theoretical_loss": 3.8523264584650834, + "tokens_seen": 581766144 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041601805416248747, + "loss": 2.8741, + "theoretical_loss": 3.8522809331094168, + "tokens_seen": 581831680 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041600802407221665, + "loss": 3.0476, + "theoretical_loss": 3.852235414316934, + "tokens_seen": 581897216 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041599799398194583, + "loss": 2.9772, + "theoretical_loss": 3.85218990208595, + "tokens_seen": 581962752 + }, + { + "epoch": 1.07, + "learning_rate": 0.000415987963891675, + "loss": 3.0647, + "theoretical_loss": 3.85214439641478, + "tokens_seen": 582028288 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004159779338014042, + "loss": 3.1339, + "theoretical_loss": 3.852098897301741, + "tokens_seen": 582093824 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041596790371113343, + "loss": 3.0075, + "theoretical_loss": 3.8520534047451482, + "tokens_seen": 582159360 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041595787362086256, + "loss": 2.8167, + "theoretical_loss": 3.85200791874332, + "tokens_seen": 582224896 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004159478435305918, + "loss": 3.2351, + "theoretical_loss": 3.8519624392945735, + "tokens_seen": 582290432 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004159378134403209, + "loss": 2.961, + "theoretical_loss": 3.8519169663972277, + "tokens_seen": 582355968 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041592778335005015, + "loss": 2.9769, + "theoretical_loss": 3.8518715000496018, + "tokens_seen": 582421504 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041591775325977934, + "loss": 3.0206, + "theoretical_loss": 3.8518260402500157, + "tokens_seen": 582487040 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004159077231695085, + "loss": 3.1565, + "theoretical_loss": 3.8517805869967887, + "tokens_seen": 582552576 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041589769307923775, + "loss": 2.91, + "theoretical_loss": 3.8517351402882434, + "tokens_seen": 582618112 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041588766298896693, + "loss": 2.8934, + "theoretical_loss": 3.8516897001227006, + "tokens_seen": 582683648 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004158776328986961, + "loss": 3.0345, + "theoretical_loss": 3.8516442664984822, + "tokens_seen": 582749184 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004158676028084253, + "loss": 2.9153, + "theoretical_loss": 3.851598839413912, + "tokens_seen": 582814720 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004158575727181545, + "loss": 3.0654, + "theoretical_loss": 3.8515534188673124, + "tokens_seen": 582880256 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041584754262788366, + "loss": 2.9278, + "theoretical_loss": 3.851508004857008, + "tokens_seen": 582945792 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004158375125376129, + "loss": 3.086, + "theoretical_loss": 3.8514625973813246, + "tokens_seen": 583011328 + }, + { + "epoch": 1.07, + "learning_rate": 0.000415827482447342, + "loss": 3.0015, + "theoretical_loss": 3.8514171964385855, + "tokens_seen": 583076864 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041581745235707126, + "loss": 2.9041, + "theoretical_loss": 3.851371802027118, + "tokens_seen": 583142400 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004158074222668004, + "loss": 3.0837, + "theoretical_loss": 3.8513264141452486, + "tokens_seen": 583207936 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 951353, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.183842897415161, + "objective/train/theoretical_loss": 3.8512923775178884, + "objective/train/tokens_used": 603717088, + "theoretical_loss": 3.8512923775178884, + "tokens_seen": 583257088 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004157973921765296, + "loss": 3.015, + "theoretical_loss": 3.851281032791304, + "tokens_seen": 583273472 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004157873620862588, + "loss": 3.0829, + "theoretical_loss": 3.8512356579636124, + "tokens_seen": 583339008 + }, + { + "epoch": 1.07, + "learning_rate": 0.000415777331995988, + "loss": 2.8001, + "theoretical_loss": 3.8511902896605017, + "tokens_seen": 583404544 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041576730190571716, + "loss": 3.228, + "theoretical_loss": 3.8511449278803016, + "tokens_seen": 583470080 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004157572718154464, + "loss": 3.1607, + "theoretical_loss": 3.851099572621342, + "tokens_seen": 583535616 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004157472417251755, + "loss": 3.0363, + "theoretical_loss": 3.8510542238819516, + "tokens_seen": 583601152 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041573721163490476, + "loss": 2.9157, + "theoretical_loss": 3.851008881660463, + "tokens_seen": 583666688 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004157271815446339, + "loss": 3.0729, + "theoretical_loss": 3.850963545955207, + "tokens_seen": 583732224 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004157171514543631, + "loss": 3.1177, + "theoretical_loss": 3.8509182167645157, + "tokens_seen": 583797760 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004157071213640923, + "loss": 2.874, + "theoretical_loss": 3.850872894086722, + "tokens_seen": 583863296 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004156970912738215, + "loss": 3.1923, + "theoretical_loss": 3.8508275779201586, + "tokens_seen": 583928832 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041568706118355066, + "loss": 2.9088, + "theoretical_loss": 3.85078226826316, + "tokens_seen": 583994368 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041567703109327984, + "loss": 3.0295, + "theoretical_loss": 3.850736965114061, + "tokens_seen": 584059904 + }, + { + "epoch": 1.07, + "learning_rate": 0.000415667001003009, + "loss": 2.9158, + "theoretical_loss": 3.850691668471197, + "tokens_seen": 584125440 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041565697091273826, + "loss": 3.0714, + "theoretical_loss": 3.8506463783329026, + "tokens_seen": 584190976 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004156469408224674, + "loss": 2.9929, + "theoretical_loss": 3.850601094697515, + "tokens_seen": 584256512 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004156369107321966, + "loss": 2.9563, + "theoretical_loss": 3.850555817563371, + "tokens_seen": 584322048 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041562688064192575, + "loss": 3.0287, + "theoretical_loss": 3.850510546928809, + "tokens_seen": 584387584 + }, + { + "epoch": 1.07, + "learning_rate": 0.000415616850551655, + "loss": 2.9658, + "theoretical_loss": 3.8504652827921664, + "tokens_seen": 584453120 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041560682046138417, + "loss": 2.9707, + "theoretical_loss": 3.8504200251517817, + "tokens_seen": 584518656 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041559679037111335, + "loss": 3.1128, + "theoretical_loss": 3.8503747740059957, + "tokens_seen": 584584192 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041558676028084253, + "loss": 2.8581, + "theoretical_loss": 3.850329529353147, + "tokens_seen": 584649728 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041557673019057176, + "loss": 3.0436, + "theoretical_loss": 3.850284291191577, + "tokens_seen": 584715264 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004155667001003009, + "loss": 3.022, + "theoretical_loss": 3.8502390595196276, + "tokens_seen": 584780800 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004155566700100301, + "loss": 3.0739, + "theoretical_loss": 3.8501938343356397, + "tokens_seen": 584846336 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 954355, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.900529146194458, + "objective/train/theoretical_loss": 3.8501599197043763, + "objective/train/tokens_used": 605355488, + "theoretical_loss": 3.8501599197043763, + "tokens_seen": 584895488 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041554663991975925, + "loss": 3.0125, + "theoretical_loss": 3.850148615637956, + "tokens_seen": 584911872 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004155366098294885, + "loss": 3.2104, + "theoretical_loss": 3.85010340342492, + "tokens_seen": 584977408 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041552657973921767, + "loss": 3.0367, + "theoretical_loss": 3.8500581976948753, + "tokens_seen": 585042944 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041551654964894685, + "loss": 2.9717, + "theoretical_loss": 3.850012998446166, + "tokens_seen": 585108480 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041550651955867603, + "loss": 3.0594, + "theoretical_loss": 3.849967805677137, + "tokens_seen": 585174016 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004154964894684052, + "loss": 2.7544, + "theoretical_loss": 3.8499226193861347, + "tokens_seen": 585239552 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004154864593781344, + "loss": 3.0167, + "theoretical_loss": 3.8498774395715043, + "tokens_seen": 585305088 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041547642928786363, + "loss": 3.0751, + "theoretical_loss": 3.849832266231593, + "tokens_seen": 585370624 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041546639919759276, + "loss": 3.0322, + "theoretical_loss": 3.8497870993647476, + "tokens_seen": 585436160 + }, + { + "epoch": 1.07, + "learning_rate": 0.000415456369107322, + "loss": 2.9255, + "theoretical_loss": 3.8497419389693173, + "tokens_seen": 585501696 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004154463390170511, + "loss": 3.0693, + "theoretical_loss": 3.8496967850436494, + "tokens_seen": 585567232 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041543630892678035, + "loss": 3.1256, + "theoretical_loss": 3.8496516375860943, + "tokens_seen": 585632768 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041542627883650954, + "loss": 3.1751, + "theoretical_loss": 3.8496064965950008, + "tokens_seen": 585698304 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004154162487462387, + "loss": 3.0867, + "theoretical_loss": 3.8495613620687195, + "tokens_seen": 585763840 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004154062186559679, + "loss": 2.7428, + "theoretical_loss": 3.849516234005602, + "tokens_seen": 585829376 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041539618856569713, + "loss": 2.9645, + "theoretical_loss": 3.8494711124039993, + "tokens_seen": 585894912 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041538615847542626, + "loss": 3.1157, + "theoretical_loss": 3.849425997262264, + "tokens_seen": 585960448 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004153761283851555, + "loss": 3.1513, + "theoretical_loss": 3.8493808885787484, + "tokens_seen": 586025984 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004153660982948846, + "loss": 3.0502, + "theoretical_loss": 3.8493357863518067, + "tokens_seen": 586091520 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041535606820461386, + "loss": 3.2173, + "theoretical_loss": 3.849290690579792, + "tokens_seen": 586157056 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041534603811434304, + "loss": 3.0952, + "theoretical_loss": 3.84924560126106, + "tokens_seen": 586222592 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004153360080240722, + "loss": 3.0117, + "theoretical_loss": 3.849200518393965, + "tokens_seen": 586288128 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004153259779338014, + "loss": 2.9294, + "theoretical_loss": 3.849155441976863, + "tokens_seen": 586353664 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004153159478435306, + "loss": 3.0769, + "theoretical_loss": 3.8491103720081115, + "tokens_seen": 586419200 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041530591775325976, + "loss": 3.0588, + "theoretical_loss": 3.8490653084860664, + "tokens_seen": 586484736 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 957235, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7505781650543213, + "objective/train/theoretical_loss": 3.849031515074196, + "objective/train/tokens_used": 606993888, + "theoretical_loss": 3.849031515074196, + "tokens_seen": 586533888 + }, + { + "epoch": 1.07, + "learning_rate": 0.000415295887662989, + "loss": 2.9773, + "theoretical_loss": 3.8490202514090854, + "tokens_seen": 586550272 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004152858575727181, + "loss": 2.893, + "theoretical_loss": 3.848975200775527, + "tokens_seen": 586615808 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041527582748244736, + "loss": 2.975, + "theoretical_loss": 3.8489301565837506, + "tokens_seen": 586681344 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004152657973921765, + "loss": 2.9788, + "theoretical_loss": 3.848885118832115, + "tokens_seen": 586746880 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004152557673019057, + "loss": 2.9803, + "theoretical_loss": 3.8488400875189805, + "tokens_seen": 586812416 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004152457372116349, + "loss": 3.2023, + "theoretical_loss": 3.848795062642708, + "tokens_seen": 586877952 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004152357071213641, + "loss": 2.8639, + "theoretical_loss": 3.8487500442016582, + "tokens_seen": 586943488 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041522567703109327, + "loss": 3.0875, + "theoretical_loss": 3.848705032194193, + "tokens_seen": 587009024 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004152156469408225, + "loss": 2.9996, + "theoretical_loss": 3.848660026618675, + "tokens_seen": 587074560 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041520561685055163, + "loss": 2.8772, + "theoretical_loss": 3.8486150274734676, + "tokens_seen": 587140096 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041519558676028086, + "loss": 3.0544, + "theoretical_loss": 3.848570034756934, + "tokens_seen": 587205632 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041518555667001, + "loss": 3.1574, + "theoretical_loss": 3.8485250484674385, + "tokens_seen": 587271168 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004151755265797392, + "loss": 3.0388, + "theoretical_loss": 3.8484800686033465, + "tokens_seen": 587336704 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004151654964894684, + "loss": 3.0765, + "theoretical_loss": 3.8484350951630226, + "tokens_seen": 587402240 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004151554663991976, + "loss": 2.9748, + "theoretical_loss": 3.8483901281448336, + "tokens_seen": 587467776 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004151454363089268, + "loss": 2.9189, + "theoretical_loss": 3.848345167547146, + "tokens_seen": 587533312 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041513540621865595, + "loss": 3.0692, + "theoretical_loss": 3.8483002133683266, + "tokens_seen": 587598848 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004151253761283852, + "loss": 3.0728, + "theoretical_loss": 3.8482552656067437, + "tokens_seen": 587664384 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041511534603811437, + "loss": 3.0838, + "theoretical_loss": 3.8482103242607657, + "tokens_seen": 587729920 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041510531594784355, + "loss": 3.1988, + "theoretical_loss": 3.848165389328762, + "tokens_seen": 587795456 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041509528585757273, + "loss": 3.0799, + "theoretical_loss": 3.848120460809101, + "tokens_seen": 587860992 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041508525576730196, + "loss": 2.9915, + "theoretical_loss": 3.848075538700154, + "tokens_seen": 587926528 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004150752256770311, + "loss": 3.0502, + "theoretical_loss": 3.848030623000291, + "tokens_seen": 587992064 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004150651955867603, + "loss": 3.0876, + "theoretical_loss": 3.8479857137078843, + "tokens_seen": 588057600 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041505516549648945, + "loss": 3.0191, + "theoretical_loss": 3.8479408108213065, + "tokens_seen": 588123136 + }, + { + "epoch": 1.07, + "objective/train/docs_used": 958723, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8858883380889893, + "objective/train/theoretical_loss": 3.8479071378592176, + "objective/train/tokens_used": 608632288, + "theoretical_loss": 3.8479071378592176, + "tokens_seen": 588172288 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004150451354062187, + "loss": 3.0228, + "theoretical_loss": 3.847895914338928, + "tokens_seen": 588188672 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041503510531594787, + "loss": 2.9931, + "theoretical_loss": 3.847851024259124, + "tokens_seen": 588254208 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041502507522567705, + "loss": 3.0363, + "theoretical_loss": 3.847806140580267, + "tokens_seen": 588319744 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041501504513540623, + "loss": 3.0483, + "theoretical_loss": 3.8477612633007325, + "tokens_seen": 588385280 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004150050150451354, + "loss": 2.8061, + "theoretical_loss": 3.847716392418895, + "tokens_seen": 588450816 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004149949849548646, + "loss": 2.9193, + "theoretical_loss": 3.8476715279331297, + "tokens_seen": 588516352 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041498495486459383, + "loss": 2.986, + "theoretical_loss": 3.847626669841813, + "tokens_seen": 588581888 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041497492477432296, + "loss": 3.056, + "theoretical_loss": 3.847581818143323, + "tokens_seen": 588647424 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004149648946840522, + "loss": 3.1504, + "theoretical_loss": 3.8475369728360347, + "tokens_seen": 588712960 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004149548645937813, + "loss": 2.9706, + "theoretical_loss": 3.847492133918327, + "tokens_seen": 588778496 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041494483450351055, + "loss": 3.0385, + "theoretical_loss": 3.8474473013885797, + "tokens_seen": 588844032 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041493480441323974, + "loss": 2.9125, + "theoretical_loss": 3.84740247524517, + "tokens_seen": 588909568 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004149247743229689, + "loss": 2.9144, + "theoretical_loss": 3.847357655486479, + "tokens_seen": 588975104 + }, + { + "epoch": 1.07, + "learning_rate": 0.0004149147442326981, + "loss": 2.9938, + "theoretical_loss": 3.8473128421108864, + "tokens_seen": 589040640 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041490471414242733, + "loss": 3.1714, + "theoretical_loss": 3.8472680351167736, + "tokens_seen": 589106176 + }, + { + "epoch": 1.07, + "learning_rate": 0.00041489468405215646, + "loss": 3.0252, + "theoretical_loss": 3.847223234502522, + "tokens_seen": 589171712 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004148846539618857, + "loss": 3.0042, + "theoretical_loss": 3.847178440266513, + "tokens_seen": 589237248 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004148746238716148, + "loss": 3.2215, + "theoretical_loss": 3.8471336524071296, + "tokens_seen": 589302784 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041486459378134406, + "loss": 2.8203, + "theoretical_loss": 3.8470888709227555, + "tokens_seen": 589368320 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041485456369107324, + "loss": 2.952, + "theoretical_loss": 3.8470440958117744, + "tokens_seen": 589433856 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004148445336008024, + "loss": 2.9955, + "theoretical_loss": 3.84699932707257, + "tokens_seen": 589499392 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004148345035105316, + "loss": 3.0176, + "theoretical_loss": 3.846954564703529, + "tokens_seen": 589564928 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004148244734202608, + "loss": 3.0683, + "theoretical_loss": 3.846909808703036, + "tokens_seen": 589630464 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041481444332998996, + "loss": 2.9814, + "theoretical_loss": 3.8468650590694766, + "tokens_seen": 589696000 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004148044132397192, + "loss": 2.9985, + "theoretical_loss": 3.8468203158012386, + "tokens_seen": 589761536 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 961549, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1653852462768555, + "objective/train/theoretical_loss": 3.8467867625263317, + "objective/train/tokens_used": 610270688, + "theoretical_loss": 3.8467867625263317, + "tokens_seen": 589810688 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004147943831494483, + "loss": 3.004, + "theoretical_loss": 3.8467755788967084, + "tokens_seen": 589827072 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041478435305917756, + "loss": 3.1301, + "theoretical_loss": 3.8467308483542757, + "tokens_seen": 589892608 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004147743229689067, + "loss": 3.1086, + "theoretical_loss": 3.846686124172327, + "tokens_seen": 589958144 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004147642928786359, + "loss": 3.0054, + "theoretical_loss": 3.846641406349253, + "tokens_seen": 590023680 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004147542627883651, + "loss": 2.9542, + "theoretical_loss": 3.846596694883443, + "tokens_seen": 590089216 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004147442326980943, + "loss": 3.0371, + "theoretical_loss": 3.846551989773287, + "tokens_seen": 590154752 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041473420260782347, + "loss": 2.9659, + "theoretical_loss": 3.8465072910171765, + "tokens_seen": 590220288 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004147241725175527, + "loss": 2.8853, + "theoretical_loss": 3.846462598613502, + "tokens_seen": 590285824 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041471414242728183, + "loss": 2.9144, + "theoretical_loss": 3.846417912560657, + "tokens_seen": 590351360 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041470411233701106, + "loss": 3.0578, + "theoretical_loss": 3.846373232857033, + "tokens_seen": 590416896 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004146940822467402, + "loss": 3.1278, + "theoretical_loss": 3.846328559501024, + "tokens_seen": 590482432 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004146840521564694, + "loss": 3.0067, + "theoretical_loss": 3.846283892491023, + "tokens_seen": 590547968 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004146740220661986, + "loss": 3.0216, + "theoretical_loss": 3.8462392318254253, + "tokens_seen": 590613504 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004146639919759278, + "loss": 3.0476, + "theoretical_loss": 3.846194577502626, + "tokens_seen": 590679040 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041465396188565697, + "loss": 2.8019, + "theoretical_loss": 3.8461499295210198, + "tokens_seen": 590744576 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041464393179538615, + "loss": 3.0438, + "theoretical_loss": 3.846105287879003, + "tokens_seen": 590810112 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041463390170511533, + "loss": 2.9569, + "theoretical_loss": 3.846060652574973, + "tokens_seen": 590875648 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041462387161484457, + "loss": 3.0703, + "theoretical_loss": 3.8460160236073273, + "tokens_seen": 590941184 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004146138415245737, + "loss": 3.0099, + "theoretical_loss": 3.845971400974463, + "tokens_seen": 591006720 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041460381143430293, + "loss": 3.1212, + "theoretical_loss": 3.845926784674779, + "tokens_seen": 591072256 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004145937813440321, + "loss": 2.9014, + "theoretical_loss": 3.8458821747066745, + "tokens_seen": 591137792 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004145837512537613, + "loss": 3.0063, + "theoretical_loss": 3.845837571068549, + "tokens_seen": 591203328 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004145737211634905, + "loss": 3.0509, + "theoretical_loss": 3.845792973758803, + "tokens_seen": 591268864 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041456369107321965, + "loss": 2.9318, + "theoretical_loss": 3.845748382775837, + "tokens_seen": 591334400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041455366098294884, + "loss": 2.993, + "theoretical_loss": 3.845703798118053, + "tokens_seen": 591399936 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 964467, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.849006175994873, + "objective/train/theoretical_loss": 3.8456703637746537, + "objective/train/tokens_used": 611909088, + "theoretical_loss": 3.8456703637746537, + "tokens_seen": 591449088 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041454363089267807, + "loss": 2.9178, + "theoretical_loss": 3.845659219783852, + "tokens_seen": 591465472 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004145336008024072, + "loss": 2.9879, + "theoretical_loss": 3.8456146477716375, + "tokens_seen": 591531008 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041452357071213643, + "loss": 3.0605, + "theoretical_loss": 3.8455700820798127, + "tokens_seen": 591596544 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041451354062186556, + "loss": 2.8835, + "theoretical_loss": 3.8455255227067804, + "tokens_seen": 591662080 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004145035105315948, + "loss": 3.1039, + "theoretical_loss": 3.845480969650946, + "tokens_seen": 591727616 + }, + { + "epoch": 1.08, + "learning_rate": 0.000414493480441324, + "loss": 3.0774, + "theoretical_loss": 3.845436422910714, + "tokens_seen": 591793152 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041448345035105316, + "loss": 3.1616, + "theoretical_loss": 3.8453918824844893, + "tokens_seen": 591858688 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041447342026078234, + "loss": 3.1588, + "theoretical_loss": 3.845347348370679, + "tokens_seen": 591924224 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004144633901705115, + "loss": 3.0634, + "theoretical_loss": 3.8453028205676887, + "tokens_seen": 591989760 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004144533600802407, + "loss": 2.8994, + "theoretical_loss": 3.845258299073927, + "tokens_seen": 592055296 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041444332998996994, + "loss": 3.0099, + "theoretical_loss": 3.8452137838878, + "tokens_seen": 592120832 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041443329989969906, + "loss": 2.9763, + "theoretical_loss": 3.8451692750077173, + "tokens_seen": 592186368 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004144232698094283, + "loss": 3.01, + "theoretical_loss": 3.8451247724320874, + "tokens_seen": 592251904 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041441323971915753, + "loss": 2.9773, + "theoretical_loss": 3.8450802761593197, + "tokens_seen": 592317440 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041440320962888666, + "loss": 2.9775, + "theoretical_loss": 3.8450357861878253, + "tokens_seen": 592382976 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004143931795386159, + "loss": 3.1412, + "theoretical_loss": 3.8449913025160134, + "tokens_seen": 592448512 + }, + { + "epoch": 1.08, + "learning_rate": 0.000414383149448345, + "loss": 3.192, + "theoretical_loss": 3.844946825142296, + "tokens_seen": 592514048 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041437311935807426, + "loss": 3.0031, + "theoretical_loss": 3.844902354065085, + "tokens_seen": 592579584 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041436308926780344, + "loss": 3.2199, + "theoretical_loss": 3.844857889282793, + "tokens_seen": 592645120 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004143530591775326, + "loss": 2.9619, + "theoretical_loss": 3.844813430793832, + "tokens_seen": 592710656 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004143430290872618, + "loss": 3.0193, + "theoretical_loss": 3.8447689785966173, + "tokens_seen": 592776192 + }, + { + "epoch": 1.08, + "learning_rate": 0.000414332998996991, + "loss": 2.9577, + "theoretical_loss": 3.844724532689561, + "tokens_seen": 592841728 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041432296890672016, + "loss": 2.9824, + "theoretical_loss": 3.8446800930710796, + "tokens_seen": 592907264 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004143129388164494, + "loss": 2.9366, + "theoretical_loss": 3.8446356597395877, + "tokens_seen": 592972800 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004143029087261785, + "loss": 3.0488, + "theoretical_loss": 3.8445912326935003, + "tokens_seen": 593038336 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 966466, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1876606941223145, + "objective/train/theoretical_loss": 3.84455791653278, + "objective/train/tokens_used": 613547488, + "theoretical_loss": 3.84455791653278, + "tokens_seen": 593087488 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041429287863590776, + "loss": 3.0322, + "theoretical_loss": 3.8445468119312354, + "tokens_seen": 593103872 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004142828485456369, + "loss": 2.9967, + "theoretical_loss": 3.844502397451209, + "tokens_seen": 593169408 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004142728184553661, + "loss": 3.0091, + "theoretical_loss": 3.844457989251839, + "tokens_seen": 593234944 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004142627883650953, + "loss": 3.0879, + "theoretical_loss": 3.8444135873315433, + "tokens_seen": 593300480 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004142527582748245, + "loss": 2.9505, + "theoretical_loss": 3.844369191688741, + "tokens_seen": 593366016 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041424272818455367, + "loss": 3.0877, + "theoretical_loss": 3.844324802321851, + "tokens_seen": 593431552 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004142326980942829, + "loss": 2.9877, + "theoretical_loss": 3.8442804192292934, + "tokens_seen": 593497088 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041422266800401203, + "loss": 3.0257, + "theoretical_loss": 3.844236042409489, + "tokens_seen": 593562624 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041421263791374126, + "loss": 3.1934, + "theoretical_loss": 3.8441916718608584, + "tokens_seen": 593628160 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004142026078234704, + "loss": 3.0324, + "theoretical_loss": 3.844147307581823, + "tokens_seen": 593693696 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004141925777331996, + "loss": 3.1132, + "theoretical_loss": 3.8441029495708054, + "tokens_seen": 593759232 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004141825476429288, + "loss": 3.1037, + "theoretical_loss": 3.844058597826228, + "tokens_seen": 593824768 + }, + { + "epoch": 1.08, + "learning_rate": 0.000414172517552658, + "loss": 3.1124, + "theoretical_loss": 3.844014252346515, + "tokens_seen": 593890304 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041416248746238717, + "loss": 3.0352, + "theoretical_loss": 3.8439699131300893, + "tokens_seen": 593955840 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041415245737211635, + "loss": 3.0733, + "theoretical_loss": 3.843925580175375, + "tokens_seen": 594021376 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041414242728184553, + "loss": 2.8504, + "theoretical_loss": 3.8438812534807987, + "tokens_seen": 594086912 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041413239719157477, + "loss": 3.1643, + "theoretical_loss": 3.8438369330447846, + "tokens_seen": 594152448 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004141223671013039, + "loss": 3.0102, + "theoretical_loss": 3.843792618865759, + "tokens_seen": 594217984 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041411233701103313, + "loss": 2.9778, + "theoretical_loss": 3.843748310942149, + "tokens_seen": 594283520 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004141023069207623, + "loss": 3.0803, + "theoretical_loss": 3.843704009272382, + "tokens_seen": 594349056 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004140922768304915, + "loss": 3.2038, + "theoretical_loss": 3.843659713854886, + "tokens_seen": 594414592 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004140822467402207, + "loss": 2.9365, + "theoretical_loss": 3.843615424688089, + "tokens_seen": 594480128 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041407221664994985, + "loss": 3.0445, + "theoretical_loss": 3.8435711417704193, + "tokens_seen": 594545664 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041406218655967904, + "loss": 2.9032, + "theoretical_loss": 3.843526865100308, + "tokens_seen": 594611200 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041405215646940827, + "loss": 3.0328, + "theoretical_loss": 3.8434825946761846, + "tokens_seen": 594676736 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 969373, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.224210500717163, + "objective/train/theoretical_loss": 3.8434493959560774, + "objective/train/tokens_used": 615185888, + "theoretical_loss": 3.8434493959560774, + "tokens_seen": 594725888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004140421263791374, + "loss": 3.1332, + "theoretical_loss": 3.84343833049648, + "tokens_seen": 594742272 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041403209628886663, + "loss": 2.9844, + "theoretical_loss": 3.8433940725596245, + "tokens_seen": 594807808 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041402206619859576, + "loss": 2.8821, + "theoretical_loss": 3.843349820864051, + "tokens_seen": 594873344 + }, + { + "epoch": 1.08, + "learning_rate": 0.000414012036108325, + "loss": 3.048, + "theoretical_loss": 3.8433055754081913, + "tokens_seen": 594938880 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004140020060180542, + "loss": 2.9031, + "theoretical_loss": 3.843261336190479, + "tokens_seen": 595004416 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041399197592778336, + "loss": 2.9746, + "theoretical_loss": 3.843217103209347, + "tokens_seen": 595069952 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041398194583751254, + "loss": 2.8853, + "theoretical_loss": 3.84317287646323, + "tokens_seen": 595135488 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004139719157472417, + "loss": 2.9369, + "theoretical_loss": 3.843128655950562, + "tokens_seen": 595201024 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004139618856569709, + "loss": 2.9873, + "theoretical_loss": 3.843084441669779, + "tokens_seen": 595266560 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041395185556670014, + "loss": 3.077, + "theoretical_loss": 3.843040233619316, + "tokens_seen": 595332096 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041394182547642926, + "loss": 3.0058, + "theoretical_loss": 3.84299603179761, + "tokens_seen": 595397632 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004139317953861585, + "loss": 3.1512, + "theoretical_loss": 3.842951836203097, + "tokens_seen": 595463168 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004139217652958877, + "loss": 3.0601, + "theoretical_loss": 3.842907646834216, + "tokens_seen": 595528704 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041391173520561686, + "loss": 2.9784, + "theoretical_loss": 3.842863463689404, + "tokens_seen": 595594240 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041390170511534604, + "loss": 3.0312, + "theoretical_loss": 3.8428192867671, + "tokens_seen": 595659776 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004138916750250752, + "loss": 3.0234, + "theoretical_loss": 3.8427751160657424, + "tokens_seen": 595725312 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004138816449348044, + "loss": 3.1113, + "theoretical_loss": 3.842730951583772, + "tokens_seen": 595790848 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041387161484453364, + "loss": 3.0865, + "theoretical_loss": 3.842686793319629, + "tokens_seen": 595856384 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041386158475426277, + "loss": 2.8526, + "theoretical_loss": 3.8426426412717536, + "tokens_seen": 595921920 + }, + { + "epoch": 1.08, + "learning_rate": 0.000413851554663992, + "loss": 2.9704, + "theoretical_loss": 3.8425984954385877, + "tokens_seen": 595987456 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041384152457372113, + "loss": 3.1084, + "theoretical_loss": 3.8425543558185735, + "tokens_seen": 596052992 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041383149448345036, + "loss": 3.0968, + "theoretical_loss": 3.8425102224101533, + "tokens_seen": 596118528 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041382146439317954, + "loss": 2.9022, + "theoretical_loss": 3.8424660952117695, + "tokens_seen": 596184064 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004138114343029087, + "loss": 3.036, + "theoretical_loss": 3.8424219742218666, + "tokens_seen": 596249600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004138014042126379, + "loss": 2.9241, + "theoretical_loss": 3.842377859438889, + "tokens_seen": 596315136 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 972229, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.099447727203369, + "objective/train/theoretical_loss": 3.8423447774240147, + "objective/train/tokens_used": 616824288, + "theoretical_loss": 3.8423447774240147, + "tokens_seen": 596364288 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004137913741223671, + "loss": 3.0857, + "theoretical_loss": 3.842333750861281, + "tokens_seen": 596380672 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041378134403209627, + "loss": 2.9575, + "theoretical_loss": 3.8422896484874887, + "tokens_seen": 596446208 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004137713139418255, + "loss": 3.0211, + "theoretical_loss": 3.842245552315957, + "tokens_seen": 596511744 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041376128385155463, + "loss": 3.0404, + "theoretical_loss": 3.842201462345133, + "tokens_seen": 596577280 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041375125376128387, + "loss": 3.0444, + "theoretical_loss": 3.842157378573464, + "tokens_seen": 596642816 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041374122367101305, + "loss": 2.9655, + "theoretical_loss": 3.842113300999397, + "tokens_seen": 596708352 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041373119358074223, + "loss": 2.9632, + "theoretical_loss": 3.84206922962138, + "tokens_seen": 596773888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004137211634904714, + "loss": 3.1247, + "theoretical_loss": 3.842025164437863, + "tokens_seen": 596839424 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004137111334002006, + "loss": 3.0904, + "theoretical_loss": 3.8419811054472937, + "tokens_seen": 596904960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041370110330992977, + "loss": 3.173, + "theoretical_loss": 3.841937052648123, + "tokens_seen": 596970496 + }, + { + "epoch": 1.08, + "learning_rate": 0.000413691073219659, + "loss": 3.04, + "theoretical_loss": 3.8418930060388004, + "tokens_seen": 597036032 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041368104312938813, + "loss": 3.0189, + "theoretical_loss": 3.8418489656177783, + "tokens_seen": 597101568 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041367101303911737, + "loss": 3.0238, + "theoretical_loss": 3.8418049313835065, + "tokens_seen": 597167104 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041366098294884655, + "loss": 3.2183, + "theoretical_loss": 3.841760903334438, + "tokens_seen": 597232640 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041365095285857573, + "loss": 3.1052, + "theoretical_loss": 3.841716881469026, + "tokens_seen": 597298176 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041364092276830497, + "loss": 2.9719, + "theoretical_loss": 3.8416728657857226, + "tokens_seen": 597363712 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004136308926780341, + "loss": 3.015, + "theoretical_loss": 3.841628856282982, + "tokens_seen": 597429248 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041362086258776333, + "loss": 3.1746, + "theoretical_loss": 3.841584852959258, + "tokens_seen": 597494784 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004136108324974925, + "loss": 2.9822, + "theoretical_loss": 3.8415408558130064, + "tokens_seen": 597560320 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004136008024072217, + "loss": 2.9856, + "theoretical_loss": 3.841496864842682, + "tokens_seen": 597625856 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004135907723169509, + "loss": 3.002, + "theoretical_loss": 3.8414528800467407, + "tokens_seen": 597691392 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041358074222668005, + "loss": 3.1733, + "theoretical_loss": 3.8414089014236397, + "tokens_seen": 597756928 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041357071213640924, + "loss": 3.0447, + "theoretical_loss": 3.841364928971835, + "tokens_seen": 597822464 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041356068204613847, + "loss": 2.8644, + "theoretical_loss": 3.8413209626897853, + "tokens_seen": 597888000 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004135506519558676, + "loss": 2.9086, + "theoretical_loss": 3.8412770025759477, + "tokens_seen": 597953536 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 974998, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9194183349609375, + "objective/train/theoretical_loss": 3.8412440365375327, + "objective/train/tokens_used": 618462688, + "theoretical_loss": 3.8412440365375327, + "tokens_seen": 598002688 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041354062186559683, + "loss": 3.013, + "theoretical_loss": 3.841233048628782, + "tokens_seen": 598019072 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041353059177532596, + "loss": 3.0875, + "theoretical_loss": 3.841189100846747, + "tokens_seen": 598084608 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004135205616850552, + "loss": 2.9756, + "theoretical_loss": 3.8411451592283026, + "tokens_seen": 598150144 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004135105315947844, + "loss": 2.9891, + "theoretical_loss": 3.8411012237719087, + "tokens_seen": 598215680 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041350050150451356, + "loss": 2.9781, + "theoretical_loss": 3.841057294476027, + "tokens_seen": 598281216 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041349047141424274, + "loss": 3.0258, + "theoretical_loss": 3.8410133713391184, + "tokens_seen": 598346752 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004134804413239719, + "loss": 2.7142, + "theoretical_loss": 3.8409694543596458, + "tokens_seen": 598412288 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004134704112337011, + "loss": 2.9428, + "theoretical_loss": 3.8409255435360707, + "tokens_seen": 598477824 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041346038114343034, + "loss": 3.0602, + "theoretical_loss": 3.8408816388668576, + "tokens_seen": 598543360 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041345035105315946, + "loss": 3.2674, + "theoretical_loss": 3.8408377403504694, + "tokens_seen": 598608896 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004134403209628887, + "loss": 2.8756, + "theoretical_loss": 3.8407938479853696, + "tokens_seen": 598674432 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004134302908726179, + "loss": 2.9974, + "theoretical_loss": 3.8407499617700247, + "tokens_seen": 598739968 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041342026078234706, + "loss": 3.038, + "theoretical_loss": 3.840706081702899, + "tokens_seen": 598805504 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041341023069207624, + "loss": 3.0185, + "theoretical_loss": 3.8406622077824584, + "tokens_seen": 598871040 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004134002006018054, + "loss": 2.9893, + "theoretical_loss": 3.84061834000717, + "tokens_seen": 598936576 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004133901705115346, + "loss": 3.1061, + "theoretical_loss": 3.8405744783755003, + "tokens_seen": 599002112 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041338014042126384, + "loss": 2.9191, + "theoretical_loss": 3.8405306228859164, + "tokens_seen": 599067648 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041337011033099297, + "loss": 3.1416, + "theoretical_loss": 3.8404867735368877, + "tokens_seen": 599133184 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004133600802407222, + "loss": 2.9633, + "theoretical_loss": 3.8404429303268826, + "tokens_seen": 599198720 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041335005015045133, + "loss": 3.064, + "theoretical_loss": 3.8403990932543692, + "tokens_seen": 599264256 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041334002006018056, + "loss": 3.0741, + "theoretical_loss": 3.840355262317818, + "tokens_seen": 599329792 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041332998996990975, + "loss": 3.1355, + "theoretical_loss": 3.8403114375156995, + "tokens_seen": 599395328 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004133199598796389, + "loss": 2.965, + "theoretical_loss": 3.8402676188464846, + "tokens_seen": 599460864 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004133099297893681, + "loss": 3.0307, + "theoretical_loss": 3.840223806308644, + "tokens_seen": 599526400 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004132998996990973, + "loss": 3.0494, + "theoretical_loss": 3.8401799999006507, + "tokens_seen": 599591936 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 977873, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0760278701782227, + "objective/train/theoretical_loss": 3.8401471491164485, + "objective/train/tokens_used": 620101088, + "theoretical_loss": 3.8401471491164485, + "tokens_seen": 599641088 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041328986960882647, + "loss": 3.08, + "theoretical_loss": 3.840136199620976, + "tokens_seen": 599657472 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004132798395185557, + "loss": 2.9928, + "theoretical_loss": 3.840092405468094, + "tokens_seen": 599723008 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041326980942828483, + "loss": 3.0661, + "theoretical_loss": 3.840048617440478, + "tokens_seen": 599788544 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041325977933801407, + "loss": 2.9879, + "theoretical_loss": 3.840004835536602, + "tokens_seen": 599854080 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041324974924774325, + "loss": 2.8988, + "theoretical_loss": 3.8399610597549407, + "tokens_seen": 599919616 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041323971915747243, + "loss": 3.1188, + "theoretical_loss": 3.8399172900939695, + "tokens_seen": 599985152 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004132296890672016, + "loss": 2.9658, + "theoretical_loss": 3.8398735265521644, + "tokens_seen": 600050688 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004132196589769308, + "loss": 2.9986, + "theoretical_loss": 3.8398297691280012, + "tokens_seen": 600116224 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041320962888665997, + "loss": 3.0389, + "theoretical_loss": 3.839786017819957, + "tokens_seen": 600181760 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004131995987963892, + "loss": 3.1247, + "theoretical_loss": 3.839742272626509, + "tokens_seen": 600247296 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041318956870611833, + "loss": 3.0677, + "theoretical_loss": 3.8396985335461356, + "tokens_seen": 600312832 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041317953861584757, + "loss": 3.0485, + "theoretical_loss": 3.839654800577316, + "tokens_seen": 600378368 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004131695085255767, + "loss": 2.9576, + "theoretical_loss": 3.839611073718527, + "tokens_seen": 600443904 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041315947843530593, + "loss": 3.1009, + "theoretical_loss": 3.8395673529682504, + "tokens_seen": 600509440 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004131494483450351, + "loss": 3.009, + "theoretical_loss": 3.8395236383249656, + "tokens_seen": 600574976 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004131394182547643, + "loss": 3.0439, + "theoretical_loss": 3.839479929787153, + "tokens_seen": 600640512 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004131293881644935, + "loss": 2.9167, + "theoretical_loss": 3.8394362273532945, + "tokens_seen": 600706048 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004131193580742227, + "loss": 3.0086, + "theoretical_loss": 3.839392531021871, + "tokens_seen": 600771584 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041310932798395184, + "loss": 3.0385, + "theoretical_loss": 3.8393488407913656, + "tokens_seen": 600837120 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004130992978936811, + "loss": 3.1567, + "theoretical_loss": 3.839305156660261, + "tokens_seen": 600902656 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004130892678034102, + "loss": 2.9883, + "theoretical_loss": 3.8392614786270407, + "tokens_seen": 600968192 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041307923771313944, + "loss": 2.8769, + "theoretical_loss": 3.839217806690188, + "tokens_seen": 601033728 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004130692076228686, + "loss": 3.0023, + "theoretical_loss": 3.839174140848188, + "tokens_seen": 601099264 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004130591775325978, + "loss": 2.8735, + "theoretical_loss": 3.839130481099526, + "tokens_seen": 601164800 + }, + { + "epoch": 1.08, + "learning_rate": 0.000413049147442327, + "loss": 3.0703, + "theoretical_loss": 3.839086827442687, + "tokens_seen": 601230336 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 980509, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.051846742630005, + "objective/train/theoretical_loss": 3.839054091196906, + "objective/train/tokens_used": 621739488, + "theoretical_loss": 3.839054091196906, + "tokens_seen": 601279488 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041303911735205616, + "loss": 3.1974, + "theoretical_loss": 3.839043179876157, + "tokens_seen": 601295872 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041302908726178534, + "loss": 3.0845, + "theoretical_loss": 3.838999538398423, + "tokens_seen": 601361408 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004130190571715146, + "loss": 2.9506, + "theoretical_loss": 3.8389559030079723, + "tokens_seen": 601426944 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004130090270812437, + "loss": 2.8117, + "theoretical_loss": 3.8389122737032926, + "tokens_seen": 601492480 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041299899699097294, + "loss": 2.9768, + "theoretical_loss": 3.838868650482872, + "tokens_seen": 601558016 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041298896690070207, + "loss": 3.2143, + "theoretical_loss": 3.8388250333451994, + "tokens_seen": 601623552 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004129789368104313, + "loss": 2.7044, + "theoretical_loss": 3.8387814222887644, + "tokens_seen": 601689088 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004129689067201605, + "loss": 3.0972, + "theoretical_loss": 3.8387378173120563, + "tokens_seen": 601754624 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041295887662988966, + "loss": 3.0888, + "theoretical_loss": 3.838694218413566, + "tokens_seen": 601820160 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041294884653961884, + "loss": 2.887, + "theoretical_loss": 3.838650625591785, + "tokens_seen": 601885696 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004129388164493481, + "loss": 3.0146, + "theoretical_loss": 3.838607038845204, + "tokens_seen": 601951232 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004129287863590772, + "loss": 3.1893, + "theoretical_loss": 3.838563458172315, + "tokens_seen": 602016768 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041291875626880644, + "loss": 2.9667, + "theoretical_loss": 3.8385198835716112, + "tokens_seen": 602082304 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004129087261785356, + "loss": 2.8605, + "theoretical_loss": 3.8384763150415853, + "tokens_seen": 602147840 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004128986960882648, + "loss": 2.9565, + "theoretical_loss": 3.838432752580731, + "tokens_seen": 602213376 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041288866599799404, + "loss": 2.9827, + "theoretical_loss": 3.838389196187543, + "tokens_seen": 602278912 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041287863590772317, + "loss": 3.0457, + "theoretical_loss": 3.838345645860515, + "tokens_seen": 602344448 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004128686058174524, + "loss": 2.949, + "theoretical_loss": 3.8383021015981433, + "tokens_seen": 602409984 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041285857572718153, + "loss": 2.9036, + "theoretical_loss": 3.8382585633989237, + "tokens_seen": 602475520 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041284854563691076, + "loss": 2.9741, + "theoretical_loss": 3.838215031261352, + "tokens_seen": 602541056 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041283851554663995, + "loss": 3.1373, + "theoretical_loss": 3.8381715051839254, + "tokens_seen": 602606592 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004128284854563691, + "loss": 3.1393, + "theoretical_loss": 3.8381279851651415, + "tokens_seen": 602672128 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004128184553660983, + "loss": 3.0214, + "theoretical_loss": 3.838084471203498, + "tokens_seen": 602737664 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004128084252758275, + "loss": 3.1123, + "theoretical_loss": 3.8380409632974932, + "tokens_seen": 602803200 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041279839518555667, + "loss": 3.0441, + "theoretical_loss": 3.8379974614456263, + "tokens_seen": 602868736 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 983455, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0751092433929443, + "objective/train/theoretical_loss": 3.837964839028852, + "objective/train/tokens_used": 623377888, + "theoretical_loss": 3.837964839028852, + "tokens_seen": 602917888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004127883650952859, + "loss": 3.021, + "theoretical_loss": 3.8379539656463972, + "tokens_seen": 602934272 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041277833500501503, + "loss": 2.999, + "theoretical_loss": 3.8379104758983056, + "tokens_seen": 602999808 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041276830491474427, + "loss": 2.9916, + "theoretical_loss": 3.837866992199853, + "tokens_seen": 603065344 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041275827482447345, + "loss": 3.1436, + "theoretical_loss": 3.837823514549539, + "tokens_seen": 603130880 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041274824473420263, + "loss": 3.0579, + "theoretical_loss": 3.837780042945867, + "tokens_seen": 603196416 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004127382146439318, + "loss": 3.1, + "theoretical_loss": 3.8377365773873384, + "tokens_seen": 603261952 + }, + { + "epoch": 1.08, + "learning_rate": 0.000412728184553661, + "loss": 3.1532, + "theoretical_loss": 3.8376931178724556, + "tokens_seen": 603327488 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041271815446339017, + "loss": 3.1164, + "theoretical_loss": 3.8376496643997227, + "tokens_seen": 603393024 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004127081243731194, + "loss": 3.044, + "theoretical_loss": 3.837606216967643, + "tokens_seen": 603458560 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041269809428284854, + "loss": 2.9982, + "theoretical_loss": 3.837562775574721, + "tokens_seen": 603524096 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041268806419257777, + "loss": 2.9958, + "theoretical_loss": 3.837519340219462, + "tokens_seen": 603589632 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004126780341023069, + "loss": 2.7806, + "theoretical_loss": 3.837475910900371, + "tokens_seen": 603655168 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041266800401203613, + "loss": 3.0903, + "theoretical_loss": 3.837432487615954, + "tokens_seen": 603720704 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004126579739217653, + "loss": 2.9293, + "theoretical_loss": 3.8373890703647175, + "tokens_seen": 603786240 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004126479438314945, + "loss": 3.0689, + "theoretical_loss": 3.8373456591451696, + "tokens_seen": 603851776 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004126379137412237, + "loss": 3.0771, + "theoretical_loss": 3.837302253955816, + "tokens_seen": 603917312 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004126278836509529, + "loss": 2.9339, + "theoretical_loss": 3.8372588547951665, + "tokens_seen": 603982848 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041261785356068204, + "loss": 3.0962, + "theoretical_loss": 3.8372154616617284, + "tokens_seen": 604048384 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004126078234704113, + "loss": 3.0525, + "theoretical_loss": 3.8371720745540117, + "tokens_seen": 604113920 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004125977933801404, + "loss": 3.1821, + "theoretical_loss": 3.837128693470526, + "tokens_seen": 604179456 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041258776328986964, + "loss": 2.9045, + "theoretical_loss": 3.8370853184097813, + "tokens_seen": 604244992 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004125777331995988, + "loss": 3.0926, + "theoretical_loss": 3.8370419493702883, + "tokens_seen": 604310528 + }, + { + "epoch": 1.08, + "learning_rate": 0.000412567703109328, + "loss": 3.0931, + "theoretical_loss": 3.8369985863505587, + "tokens_seen": 604376064 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004125576730190572, + "loss": 3.0414, + "theoretical_loss": 3.836955229349104, + "tokens_seen": 604441600 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041254764292878636, + "loss": 3.0148, + "theoretical_loss": 3.836911878364436, + "tokens_seen": 604507136 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 984842, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0774950981140137, + "objective/train/theoretical_loss": 3.836879369073557, + "objective/train/tokens_used": 625016288, + "theoretical_loss": 3.836879369073557, + "tokens_seen": 604556288 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041253761283851554, + "loss": 3.0187, + "theoretical_loss": 3.8368685333950685, + "tokens_seen": 604572672 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004125275827482448, + "loss": 3.0517, + "theoretical_loss": 3.836825194439515, + "tokens_seen": 604638208 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004125175526579739, + "loss": 2.8922, + "theoretical_loss": 3.8367818614962887, + "tokens_seen": 604703744 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041250752256770314, + "loss": 3.0062, + "theoretical_loss": 3.8367385345639042, + "tokens_seen": 604769280 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041249749247743227, + "loss": 3.0448, + "theoretical_loss": 3.8366952136408767, + "tokens_seen": 604834816 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004124874623871615, + "loss": 3.2584, + "theoretical_loss": 3.836651898725722, + "tokens_seen": 604900352 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004124774322968907, + "loss": 3.1016, + "theoretical_loss": 3.8366085898169553, + "tokens_seen": 604965888 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041246740220661986, + "loss": 3.0936, + "theoretical_loss": 3.836565286913094, + "tokens_seen": 605031424 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041245737211634904, + "loss": 3.0957, + "theoretical_loss": 3.836521990012655, + "tokens_seen": 605096960 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004124473420260783, + "loss": 3.0364, + "theoretical_loss": 3.8364786991141555, + "tokens_seen": 605162496 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004124373119358074, + "loss": 3.1589, + "theoretical_loss": 3.8364354142161137, + "tokens_seen": 605228032 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041242728184553664, + "loss": 2.9862, + "theoretical_loss": 3.8363921353170487, + "tokens_seen": 605293568 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041241725175526577, + "loss": 3.1907, + "theoretical_loss": 3.8363488624154796, + "tokens_seen": 605359104 + }, + { + "epoch": 1.08, + "learning_rate": 0.000412407221664995, + "loss": 3.1097, + "theoretical_loss": 3.836305595509926, + "tokens_seen": 605424640 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004123971915747242, + "loss": 3.1064, + "theoretical_loss": 3.8362623345989086, + "tokens_seen": 605490176 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041238716148445337, + "loss": 2.9995, + "theoretical_loss": 3.8362190796809474, + "tokens_seen": 605555712 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041237713139418255, + "loss": 3.0024, + "theoretical_loss": 3.8361758307545637, + "tokens_seen": 605621248 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041236710130391173, + "loss": 3.0189, + "theoretical_loss": 3.8361325878182804, + "tokens_seen": 605686784 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004123570712136409, + "loss": 3.0344, + "theoretical_loss": 3.836089350870619, + "tokens_seen": 605752320 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041234704112337015, + "loss": 2.8063, + "theoretical_loss": 3.836046119910103, + "tokens_seen": 605817856 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041233701103309927, + "loss": 3.0322, + "theoretical_loss": 3.8360028949352545, + "tokens_seen": 605883392 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004123269809428285, + "loss": 3.0213, + "theoretical_loss": 3.835959675944598, + "tokens_seen": 605948928 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041231695085255763, + "loss": 2.8566, + "theoretical_loss": 3.835916462936659, + "tokens_seen": 606014464 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041230692076228687, + "loss": 2.9651, + "theoretical_loss": 3.8358732559099615, + "tokens_seen": 606080000 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041229689067201605, + "loss": 2.7715, + "theoretical_loss": 3.835830054863031, + "tokens_seen": 606145536 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 987508, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9807865619659424, + "objective/train/theoretical_loss": 3.835797658001169, + "objective/train/tokens_used": 626654688, + "theoretical_loss": 3.835797658001169, + "tokens_seen": 606194688 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041228686058174523, + "loss": 3.2271, + "theoretical_loss": 3.835786859794394, + "tokens_seen": 606211072 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004122768304914744, + "loss": 2.8632, + "theoretical_loss": 3.8357436707025765, + "tokens_seen": 606276608 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041226680040120365, + "loss": 2.7659, + "theoretical_loss": 3.835700487586106, + "tokens_seen": 606342144 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004122567703109328, + "loss": 3.0047, + "theoretical_loss": 3.8356573104435094, + "tokens_seen": 606407680 + }, + { + "epoch": 1.08, + "learning_rate": 0.000412246740220662, + "loss": 3.0055, + "theoretical_loss": 3.835614139273316, + "tokens_seen": 606473216 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041223671013039114, + "loss": 2.8856, + "theoretical_loss": 3.835570974074053, + "tokens_seen": 606538752 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004122266800401204, + "loss": 2.9551, + "theoretical_loss": 3.835527814844251, + "tokens_seen": 606604288 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041221664994984955, + "loss": 3.0606, + "theoretical_loss": 3.8354846615824387, + "tokens_seen": 606669824 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041220661985957874, + "loss": 2.9841, + "theoretical_loss": 3.8354415142871465, + "tokens_seen": 606735360 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121965897693079, + "loss": 2.8777, + "theoretical_loss": 3.835398372956905, + "tokens_seen": 606800896 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121865596790371, + "loss": 3.1574, + "theoretical_loss": 3.835355237590246, + "tokens_seen": 606866432 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121765295887663, + "loss": 2.9245, + "theoretical_loss": 3.8353121081857005, + "tokens_seen": 606931968 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121664994984955, + "loss": 2.7968, + "theoretical_loss": 3.835268984741801, + "tokens_seen": 606997504 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121564694082247, + "loss": 3.1327, + "theoretical_loss": 3.8352258672570807, + "tokens_seen": 607063040 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121464393179539, + "loss": 3.0982, + "theoretical_loss": 3.835182755730072, + "tokens_seen": 607128576 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121364092276831, + "loss": 3.0955, + "theoretical_loss": 3.8351396501593102, + "tokens_seen": 607194112 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041212637913741224, + "loss": 3.0337, + "theoretical_loss": 3.8350965505433283, + "tokens_seen": 607259648 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121163490471415, + "loss": 3.1495, + "theoretical_loss": 3.8350534568806616, + "tokens_seen": 607325184 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004121063189568706, + "loss": 3.0894, + "theoretical_loss": 3.8350103691698463, + "tokens_seen": 607390720 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041209628886659984, + "loss": 3.171, + "theoretical_loss": 3.834967287409417, + "tokens_seen": 607456256 + }, + { + "epoch": 1.08, + "learning_rate": 0.000412086258776329, + "loss": 3.1597, + "theoretical_loss": 3.8349242115979107, + "tokens_seen": 607521792 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004120762286860582, + "loss": 3.2156, + "theoretical_loss": 3.8348811417338644, + "tokens_seen": 607587328 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004120661985957874, + "loss": 2.8692, + "theoretical_loss": 3.8348380778158155, + "tokens_seen": 607652864 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041205616850551656, + "loss": 2.9769, + "theoretical_loss": 3.8347950198423018, + "tokens_seen": 607718400 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041204613841524574, + "loss": 2.9974, + "theoretical_loss": 3.8347519678118624, + "tokens_seen": 607783936 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 990422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9231116771698, + "objective/train/theoretical_loss": 3.834719682688296, + "objective/train/tokens_used": 628293088, + "theoretical_loss": 3.834719682688296, + "tokens_seen": 607833088 + }, + { + "epoch": 1.08, + "learning_rate": 0.000412036108324975, + "loss": 3.135, + "theoretical_loss": 3.8347089217230357, + "tokens_seen": 607849472 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004120260782347041, + "loss": 3.0485, + "theoretical_loss": 3.8346658815743613, + "tokens_seen": 607915008 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041201604814443334, + "loss": 3.1094, + "theoretical_loss": 3.834622847364379, + "tokens_seen": 607980544 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041200601805416247, + "loss": 2.8096, + "theoretical_loss": 3.8345798190916307, + "tokens_seen": 608046080 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004119959879638917, + "loss": 2.9784, + "theoretical_loss": 3.834536796754656, + "tokens_seen": 608111616 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004119859578736209, + "loss": 3.0907, + "theoretical_loss": 3.834493780351997, + "tokens_seen": 608177152 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041197592778335006, + "loss": 3.0197, + "theoretical_loss": 3.8344507698821957, + "tokens_seen": 608242688 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041196589769307924, + "loss": 2.9664, + "theoretical_loss": 3.8344077653437947, + "tokens_seen": 608308224 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004119558676028085, + "loss": 2.9605, + "theoretical_loss": 3.8343647667353373, + "tokens_seen": 608373760 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004119458375125376, + "loss": 2.8776, + "theoretical_loss": 3.8343217740553674, + "tokens_seen": 608439296 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041193580742226684, + "loss": 3.1025, + "theoretical_loss": 3.834278787302429, + "tokens_seen": 608504832 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041192577733199597, + "loss": 2.9493, + "theoretical_loss": 3.834235806475066, + "tokens_seen": 608570368 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004119157472417252, + "loss": 3.0837, + "theoretical_loss": 3.8341928315718246, + "tokens_seen": 608635904 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004119057171514544, + "loss": 3.0486, + "theoretical_loss": 3.83414986259125, + "tokens_seen": 608701440 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041189568706118357, + "loss": 2.9664, + "theoretical_loss": 3.8341068995318888, + "tokens_seen": 608766976 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041188565697091275, + "loss": 2.84, + "theoretical_loss": 3.8340639423922878, + "tokens_seen": 608832512 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041187562688064193, + "loss": 3.0444, + "theoretical_loss": 3.8340209911709935, + "tokens_seen": 608898048 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004118655967903711, + "loss": 2.9223, + "theoretical_loss": 3.833978045866554, + "tokens_seen": 608963584 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041185556670010035, + "loss": 2.8453, + "theoretical_loss": 3.8339351064775182, + "tokens_seen": 609029120 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041184553660982947, + "loss": 3.0078, + "theoretical_loss": 3.8338921730024342, + "tokens_seen": 609094656 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004118355065195587, + "loss": 3.0131, + "theoretical_loss": 3.833849245439852, + "tokens_seen": 609160192 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041182547642928783, + "loss": 3.0697, + "theoretical_loss": 3.8338063237883198, + "tokens_seen": 609225728 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041181544633901707, + "loss": 2.9675, + "theoretical_loss": 3.83376340804639, + "tokens_seen": 609291264 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041180541624874625, + "loss": 2.8143, + "theoretical_loss": 3.833720498212612, + "tokens_seen": 609356800 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041179538615847543, + "loss": 3.0922, + "theoretical_loss": 3.833677594285538, + "tokens_seen": 609422336 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 992867, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.691870927810669, + "objective/train/theoretical_loss": 3.8336454202156354, + "objective/train/tokens_used": 629931488, + "theoretical_loss": 3.8336454202156354, + "tokens_seen": 609471488 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004117853560682046, + "loss": 2.7919, + "theoretical_loss": 3.8336346962637187, + "tokens_seen": 609487872 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041177532597793385, + "loss": 2.9709, + "theoretical_loss": 3.833591804145708, + "tokens_seen": 609553408 + }, + { + "epoch": 1.08, + "learning_rate": 0.000411765295887663, + "loss": 2.9451, + "theoretical_loss": 3.8335489179300577, + "tokens_seen": 609618944 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004117552657973922, + "loss": 3.0297, + "theoretical_loss": 3.8335060376153214, + "tokens_seen": 609684480 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041174523570712134, + "loss": 2.9622, + "theoretical_loss": 3.8334631632000535, + "tokens_seen": 609750016 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004117352056168506, + "loss": 3.0537, + "theoretical_loss": 3.8334202946828073, + "tokens_seen": 609815552 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041172517552657975, + "loss": 3.0205, + "theoretical_loss": 3.8333774320621394, + "tokens_seen": 609881088 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041171514543630894, + "loss": 3.1006, + "theoretical_loss": 3.8333345753366035, + "tokens_seen": 609946624 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004117051153460381, + "loss": 3.0352, + "theoretical_loss": 3.8332917245047566, + "tokens_seen": 610012160 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004116950852557673, + "loss": 3.0553, + "theoretical_loss": 3.8332488795651543, + "tokens_seen": 610077696 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004116850551654965, + "loss": 3.0529, + "theoretical_loss": 3.8332060405163544, + "tokens_seen": 610143232 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004116750250752257, + "loss": 3.0127, + "theoretical_loss": 3.8331632073569146, + "tokens_seen": 610208768 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041166499498495484, + "loss": 2.9644, + "theoretical_loss": 3.8331203800853917, + "tokens_seen": 610274304 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004116549648946841, + "loss": 2.973, + "theoretical_loss": 3.833077558700345, + "tokens_seen": 610339840 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004116449348044132, + "loss": 2.9331, + "theoretical_loss": 3.833034743200333, + "tokens_seen": 610405376 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041163490471414244, + "loss": 3.0783, + "theoretical_loss": 3.8329919335839158, + "tokens_seen": 610470912 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004116248746238716, + "loss": 3.2036, + "theoretical_loss": 3.8329491298496525, + "tokens_seen": 610536448 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004116148445336008, + "loss": 2.8942, + "theoretical_loss": 3.8329063319961048, + "tokens_seen": 610601984 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041160481444333, + "loss": 3.1511, + "theoretical_loss": 3.8328635400218327, + "tokens_seen": 610667520 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004115947843530592, + "loss": 3.0786, + "theoretical_loss": 3.8328207539253984, + "tokens_seen": 610733056 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041158475426278834, + "loss": 3.0982, + "theoretical_loss": 3.8327779737053636, + "tokens_seen": 610798592 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004115747241725176, + "loss": 2.8939, + "theoretical_loss": 3.8327351993602905, + "tokens_seen": 610864128 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004115646940822467, + "loss": 3.1116, + "theoretical_loss": 3.8326924308887427, + "tokens_seen": 610929664 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041155466399197594, + "loss": 2.9279, + "theoretical_loss": 3.8326496682892834, + "tokens_seen": 610995200 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004115446339017051, + "loss": 3.1418, + "theoretical_loss": 3.832606911560477, + "tokens_seen": 611060736 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 995797, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.207310914993286, + "objective/train/theoretical_loss": 3.832574847865624, + "objective/train/tokens_used": 631569888, + "theoretical_loss": 3.832574847865624, + "tokens_seen": 611109888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004115346038114343, + "loss": 3.0943, + "theoretical_loss": 3.832564160700888, + "tokens_seen": 611126272 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004115245737211635, + "loss": 2.8869, + "theoretical_loss": 3.8325214157090803, + "tokens_seen": 611191808 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041151454363089267, + "loss": 2.9761, + "theoretical_loss": 3.832478676583622, + "tokens_seen": 611257344 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041150451354062185, + "loss": 3.0906, + "theoretical_loss": 3.832435943323077, + "tokens_seen": 611322880 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004114944834503511, + "loss": 2.9312, + "theoretical_loss": 3.832393215926012, + "tokens_seen": 611388416 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004114844533600802, + "loss": 2.9624, + "theoretical_loss": 3.8323504943909947, + "tokens_seen": 611453952 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041147442326980944, + "loss": 2.9751, + "theoretical_loss": 3.832307778716593, + "tokens_seen": 611519488 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041146439317953857, + "loss": 3.0164, + "theoretical_loss": 3.832265068901374, + "tokens_seen": 611585024 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004114543630892678, + "loss": 2.9236, + "theoretical_loss": 3.8322223649439073, + "tokens_seen": 611650560 + }, + { + "epoch": 1.08, + "learning_rate": 0.000411444332998997, + "loss": 2.9211, + "theoretical_loss": 3.8321796668427615, + "tokens_seen": 611716096 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041143430290872617, + "loss": 3.0451, + "theoretical_loss": 3.832136974596506, + "tokens_seen": 611781632 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041142427281845535, + "loss": 3.0971, + "theoretical_loss": 3.8320942882037112, + "tokens_seen": 611847168 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004114142427281846, + "loss": 3.0028, + "theoretical_loss": 3.8320516076629474, + "tokens_seen": 611912704 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041140421263791377, + "loss": 2.9144, + "theoretical_loss": 3.832008932972786, + "tokens_seen": 611978240 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041139418254764295, + "loss": 3.089, + "theoretical_loss": 3.8319662641317986, + "tokens_seen": 612043776 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041138415245737213, + "loss": 3.0041, + "theoretical_loss": 3.8319236011385573, + "tokens_seen": 612109312 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004113741223671013, + "loss": 2.93, + "theoretical_loss": 3.831880943991634, + "tokens_seen": 612174848 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041136409227683055, + "loss": 3.1347, + "theoretical_loss": 3.8318382926896035, + "tokens_seen": 612240384 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041135406218655967, + "loss": 3.1261, + "theoretical_loss": 3.8317956472310373, + "tokens_seen": 612305920 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004113440320962889, + "loss": 3.0304, + "theoretical_loss": 3.8317530076145108, + "tokens_seen": 612371456 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041133400200601803, + "loss": 3.1802, + "theoretical_loss": 3.8317103738385985, + "tokens_seen": 612436992 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041132397191574727, + "loss": 3.036, + "theoretical_loss": 3.8316677459018758, + "tokens_seen": 612502528 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041131394182547645, + "loss": 2.9446, + "theoretical_loss": 3.831625123802917, + "tokens_seen": 612568064 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041130391173520563, + "loss": 2.8456, + "theoretical_loss": 3.8315825075402996, + "tokens_seen": 612633600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004112938816449348, + "loss": 2.9578, + "theoretical_loss": 3.831539897112599, + "tokens_seen": 612699136 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 998554, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9871644973754883, + "objective/train/theoretical_loss": 3.8315079431201324, + "objective/train/tokens_used": 633208288, + "theoretical_loss": 3.8315079431201324, + "tokens_seen": 612748288 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041128385155466405, + "loss": 3.0515, + "theoretical_loss": 3.831497292518393, + "tokens_seen": 612764672 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004112738214643932, + "loss": 2.9643, + "theoretical_loss": 3.83145469375626, + "tokens_seen": 612830208 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004112637913741224, + "loss": 3.036, + "theoretical_loss": 3.831412100824777, + "tokens_seen": 612895744 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041125376128385154, + "loss": 3.0408, + "theoretical_loss": 3.8313695137225223, + "tokens_seen": 612961280 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004112437311935808, + "loss": 3.0311, + "theoretical_loss": 3.831326932448076, + "tokens_seen": 613026816 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041123370110330995, + "loss": 3.0392, + "theoretical_loss": 3.8312843570000172, + "tokens_seen": 613092352 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041122367101303914, + "loss": 2.8045, + "theoretical_loss": 3.8312417873769262, + "tokens_seen": 613157888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004112136409227683, + "loss": 2.9349, + "theoretical_loss": 3.831199223577383, + "tokens_seen": 613223424 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004112036108324975, + "loss": 3.0427, + "theoretical_loss": 3.83115666559997, + "tokens_seen": 613288960 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004111935807422267, + "loss": 2.8969, + "theoretical_loss": 3.8311141134432676, + "tokens_seen": 613354496 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004111835506519559, + "loss": 2.9732, + "theoretical_loss": 3.831071567105858, + "tokens_seen": 613420032 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041117352056168504, + "loss": 2.8737, + "theoretical_loss": 3.831029026586324, + "tokens_seen": 613485568 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004111634904714143, + "loss": 3.0429, + "theoretical_loss": 3.830986491883249, + "tokens_seen": 613551104 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004111534603811434, + "loss": 3.1107, + "theoretical_loss": 3.8309439629952156, + "tokens_seen": 613616640 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041114343029087264, + "loss": 3.1418, + "theoretical_loss": 3.8309014399208094, + "tokens_seen": 613682176 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004111334002006018, + "loss": 3.0228, + "theoretical_loss": 3.8308589226586136, + "tokens_seen": 613747712 + }, + { + "epoch": 1.08, + "learning_rate": 0.000411123370110331, + "loss": 3.0622, + "theoretical_loss": 3.8308164112072145, + "tokens_seen": 613813248 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004111133400200602, + "loss": 3.0705, + "theoretical_loss": 3.830773905565196, + "tokens_seen": 613878784 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004111033099297894, + "loss": 2.9441, + "theoretical_loss": 3.8307314057311457, + "tokens_seen": 613944320 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041109327983951854, + "loss": 2.9733, + "theoretical_loss": 3.830688911703649, + "tokens_seen": 614009856 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004110832497492478, + "loss": 2.9421, + "theoretical_loss": 3.8306464234812942, + "tokens_seen": 614075392 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004110732196589769, + "loss": 2.9493, + "theoretical_loss": 3.8306039410626678, + "tokens_seen": 614140928 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041106318956870614, + "loss": 2.6624, + "theoretical_loss": 3.830561464446358, + "tokens_seen": 614206464 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004110531594784353, + "loss": 2.9043, + "theoretical_loss": 3.830518993630954, + "tokens_seen": 614272000 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004110431293881645, + "loss": 2.8925, + "theoretical_loss": 3.8304765286150437, + "tokens_seen": 614337536 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 1001400, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.817751884460449, + "objective/train/theoretical_loss": 3.8304446836581807, + "objective/train/tokens_used": 634846688, + "theoretical_loss": 3.8304446836581807, + "tokens_seen": 614386688 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004110330992978937, + "loss": 3.1141, + "theoretical_loss": 3.8304340693972176, + "tokens_seen": 614403072 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041102306920762287, + "loss": 2.9687, + "theoretical_loss": 3.8303916159760654, + "tokens_seen": 614468608 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041101303911735205, + "loss": 3.0187, + "theoretical_loss": 3.8303491683501774, + "tokens_seen": 614534144 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004110030090270813, + "loss": 2.9472, + "theoretical_loss": 3.8303067265181445, + "tokens_seen": 614599680 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004109929789368104, + "loss": 3.1324, + "theoretical_loss": 3.830264290478559, + "tokens_seen": 614665216 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041098294884653965, + "loss": 3.0493, + "theoretical_loss": 3.830221860230012, + "tokens_seen": 614730752 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041097291875626877, + "loss": 3.0609, + "theoretical_loss": 3.8301794357710963, + "tokens_seen": 614796288 + }, + { + "epoch": 1.08, + "learning_rate": 0.000410962888665998, + "loss": 2.9278, + "theoretical_loss": 3.8301370171004048, + "tokens_seen": 614861824 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004109528585757272, + "loss": 2.986, + "theoretical_loss": 3.830094604216531, + "tokens_seen": 614927360 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041094282848545637, + "loss": 3.0171, + "theoretical_loss": 3.8300521971180688, + "tokens_seen": 614992896 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041093279839518555, + "loss": 2.9128, + "theoretical_loss": 3.830009795803613, + "tokens_seen": 615058432 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004109227683049148, + "loss": 2.9755, + "theoretical_loss": 3.829967400271758, + "tokens_seen": 615123968 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004109127382146439, + "loss": 3.1336, + "theoretical_loss": 3.8299250105211, + "tokens_seen": 615189504 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041090270812437315, + "loss": 2.9903, + "theoretical_loss": 3.8298826265502335, + "tokens_seen": 615255040 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004108926780341023, + "loss": 2.8911, + "theoretical_loss": 3.829840248357756, + "tokens_seen": 615320576 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004108826479438315, + "loss": 2.869, + "theoretical_loss": 3.8297978759422646, + "tokens_seen": 615386112 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004108726178535607, + "loss": 3.0514, + "theoretical_loss": 3.8297555093023554, + "tokens_seen": 615451648 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041086258776328987, + "loss": 2.9684, + "theoretical_loss": 3.8297131484366282, + "tokens_seen": 615517184 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041085255767301905, + "loss": 2.8943, + "theoretical_loss": 3.8296707933436798, + "tokens_seen": 615582720 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041084252758274823, + "loss": 3.1604, + "theoretical_loss": 3.8296284440221093, + "tokens_seen": 615648256 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004108324974924774, + "loss": 2.9569, + "theoretical_loss": 3.8295861004705163, + "tokens_seen": 615713792 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041082246740220665, + "loss": 2.8279, + "theoretical_loss": 3.829543762687501, + "tokens_seen": 615779328 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004108124373119358, + "loss": 2.9963, + "theoretical_loss": 3.829501430671663, + "tokens_seen": 615844864 + }, + { + "epoch": 1.08, + "learning_rate": 0.000410802407221665, + "loss": 2.9896, + "theoretical_loss": 3.8294591044216038, + "tokens_seen": 615910400 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004107923771313942, + "loss": 2.9821, + "theoretical_loss": 3.829416783935924, + "tokens_seen": 615975936 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 1002886, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0785562992095947, + "objective/train/theoretical_loss": 3.8293850473536972, + "objective/train/tokens_used": 636485088, + "theoretical_loss": 3.8293850473536972, + "tokens_seen": 616025088 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004107823470411234, + "loss": 3.0779, + "theoretical_loss": 3.8293744692132257, + "tokens_seen": 616041472 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041077231695085256, + "loss": 2.9559, + "theoretical_loss": 3.829332160252111, + "tokens_seen": 616107008 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041076228686058174, + "loss": 3.0394, + "theoretical_loss": 3.829289857051183, + "tokens_seen": 616172544 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004107522567703109, + "loss": 3.0019, + "theoretical_loss": 3.8292475596090454, + "tokens_seen": 616238080 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041074222668004015, + "loss": 2.9541, + "theoretical_loss": 3.8292052679243005, + "tokens_seen": 616303616 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004107321965897693, + "loss": 2.821, + "theoretical_loss": 3.8291629819955544, + "tokens_seen": 616369152 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004107221664994985, + "loss": 2.9857, + "theoretical_loss": 3.82912070182141, + "tokens_seen": 616434688 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041071213640922764, + "loss": 2.9157, + "theoretical_loss": 3.829078427400473, + "tokens_seen": 616500224 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004107021063189569, + "loss": 2.9796, + "theoretical_loss": 3.82903615873135, + "tokens_seen": 616565760 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041069207622868606, + "loss": 2.9918, + "theoretical_loss": 3.828993895812646, + "tokens_seen": 616631296 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041068204613841524, + "loss": 3.0568, + "theoretical_loss": 3.8289516386429683, + "tokens_seen": 616696832 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004106720160481444, + "loss": 3.03, + "theoretical_loss": 3.828909387220924, + "tokens_seen": 616762368 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004106619859578736, + "loss": 2.936, + "theoretical_loss": 3.8288671415451208, + "tokens_seen": 616827904 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041065195586760284, + "loss": 3.0045, + "theoretical_loss": 3.8288249016141664, + "tokens_seen": 616893440 + }, + { + "epoch": 1.08, + "learning_rate": 0.000410641925777332, + "loss": 2.9838, + "theoretical_loss": 3.82878266742667, + "tokens_seen": 616958976 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004106318956870612, + "loss": 3.1482, + "theoretical_loss": 3.8287404389812396, + "tokens_seen": 617024512 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004106218655967904, + "loss": 2.9304, + "theoretical_loss": 3.8286982162764858, + "tokens_seen": 617090048 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004106118355065196, + "loss": 2.9142, + "theoretical_loss": 3.8286559993110183, + "tokens_seen": 617155584 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041060180541624874, + "loss": 3.0832, + "theoretical_loss": 3.828613788083448, + "tokens_seen": 617221120 + }, + { + "epoch": 1.08, + "learning_rate": 0.000410591775325978, + "loss": 3.0624, + "theoretical_loss": 3.828571582592385, + "tokens_seen": 617286656 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004105817452357071, + "loss": 2.9972, + "theoretical_loss": 3.828529382836442, + "tokens_seen": 617352192 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041057171514543634, + "loss": 2.9236, + "theoretical_loss": 3.82848718881423, + "tokens_seen": 617417728 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004105616850551655, + "loss": 2.9089, + "theoretical_loss": 3.8284450005243618, + "tokens_seen": 617483264 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004105516549648947, + "loss": 2.8452, + "theoretical_loss": 3.8284028179654506, + "tokens_seen": 617548800 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004105416248746239, + "loss": 3.0761, + "theoretical_loss": 3.8283606411361095, + "tokens_seen": 617614336 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 1005576, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1454811096191406, + "objective/train/theoretical_loss": 3.8283290122733, + "objective/train/tokens_used": 638123488, + "theoretical_loss": 3.8283290122733, + "tokens_seen": 617663488 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041053159478435307, + "loss": 3.0899, + "theoretical_loss": 3.828318470034952, + "tokens_seen": 617679872 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041052156469408225, + "loss": 2.9838, + "theoretical_loss": 3.8282763046605934, + "tokens_seen": 617745408 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004105115346038115, + "loss": 3.0412, + "theoretical_loss": 3.828234145011648, + "tokens_seen": 617810944 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004105015045135406, + "loss": 2.9035, + "theoretical_loss": 3.828191991086732, + "tokens_seen": 617876480 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041049147442326985, + "loss": 2.86, + "theoretical_loss": 3.8281498428844603, + "tokens_seen": 617942016 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041048144433299897, + "loss": 3.0557, + "theoretical_loss": 3.828107700403449, + "tokens_seen": 618007552 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004104714142427282, + "loss": 3.0982, + "theoretical_loss": 3.8280655636423164, + "tokens_seen": 618073088 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004104613841524574, + "loss": 3.0866, + "theoretical_loss": 3.828023432599678, + "tokens_seen": 618138624 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041045135406218657, + "loss": 2.8964, + "theoretical_loss": 3.827981307274152, + "tokens_seen": 618204160 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041044132397191575, + "loss": 2.991, + "theoretical_loss": 3.827939187664358, + "tokens_seen": 618269696 + }, + { + "epoch": 1.08, + "learning_rate": 0.000410431293881645, + "loss": 2.9032, + "theoretical_loss": 3.8278970737689137, + "tokens_seen": 618335232 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004104212637913741, + "loss": 2.6669, + "theoretical_loss": 3.827854965586438, + "tokens_seen": 618400768 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041041123370110335, + "loss": 2.9569, + "theoretical_loss": 3.827812863115551, + "tokens_seen": 618466304 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004104012036108325, + "loss": 3.0492, + "theoretical_loss": 3.8277707663548726, + "tokens_seen": 618531840 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004103911735205617, + "loss": 3.0387, + "theoretical_loss": 3.8277286753030237, + "tokens_seen": 618597376 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004103811434302909, + "loss": 2.895, + "theoretical_loss": 3.827686589958626, + "tokens_seen": 618662912 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041037111334002007, + "loss": 2.9178, + "theoretical_loss": 3.8276445103203, + "tokens_seen": 618728448 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041036108324974925, + "loss": 2.9299, + "theoretical_loss": 3.827602436386668, + "tokens_seen": 618793984 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041035105315947844, + "loss": 3.0073, + "theoretical_loss": 3.8275603681563535, + "tokens_seen": 618859520 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004103410230692076, + "loss": 3.0095, + "theoretical_loss": 3.8275183056279785, + "tokens_seen": 618925056 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041033099297893685, + "loss": 2.8747, + "theoretical_loss": 3.827476248800167, + "tokens_seen": 618990592 + }, + { + "epoch": 1.08, + "learning_rate": 0.000410320962888666, + "loss": 2.9539, + "theoretical_loss": 3.827434197671543, + "tokens_seen": 619056128 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004103109327983952, + "loss": 3.0231, + "theoretical_loss": 3.8273921522407304, + "tokens_seen": 619121664 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004103009027081244, + "loss": 2.9789, + "theoretical_loss": 3.8273501125063554, + "tokens_seen": 619187200 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004102908726178536, + "loss": 3.162, + "theoretical_loss": 3.827308078467042, + "tokens_seen": 619252736 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 1007992, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2259154319763184, + "objective/train/theoretical_loss": 3.827276556674115, + "objective/train/tokens_used": 639761888, + "theoretical_loss": 3.827276556674115, + "tokens_seen": 619301888 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041028084252758276, + "loss": 3.0172, + "theoretical_loss": 3.827266050121417, + "tokens_seen": 619318272 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041027081243731194, + "loss": 3.1406, + "theoretical_loss": 3.8272240274681066, + "tokens_seen": 619383808 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004102607823470411, + "loss": 3.0039, + "theoretical_loss": 3.8271820105057377, + "tokens_seen": 619449344 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041025075225677035, + "loss": 2.9916, + "theoretical_loss": 3.8271399992329376, + "tokens_seen": 619514880 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004102407221664995, + "loss": 2.9713, + "theoretical_loss": 3.827097993648334, + "tokens_seen": 619580416 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004102306920762287, + "loss": 3.0316, + "theoretical_loss": 3.8270559937505553, + "tokens_seen": 619645952 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041022066198595784, + "loss": 2.875, + "theoretical_loss": 3.82701399953823, + "tokens_seen": 619711488 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004102106318956871, + "loss": 3.0742, + "theoretical_loss": 3.8269720110099876, + "tokens_seen": 619777024 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041020060180541626, + "loss": 3.0705, + "theoretical_loss": 3.8269300281644583, + "tokens_seen": 619842560 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041019057171514544, + "loss": 2.7926, + "theoretical_loss": 3.826888051000271, + "tokens_seen": 619908096 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004101805416248746, + "loss": 3.0748, + "theoretical_loss": 3.8268460795160575, + "tokens_seen": 619973632 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004101705115346038, + "loss": 3.1991, + "theoretical_loss": 3.8268041137104487, + "tokens_seen": 620039168 + }, + { + "epoch": 1.08, + "learning_rate": 0.000410160481444333, + "loss": 2.9819, + "theoretical_loss": 3.826762153582076, + "tokens_seen": 620104704 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004101504513540622, + "loss": 2.9804, + "theoretical_loss": 3.8267201991295714, + "tokens_seen": 620170240 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041014042126379135, + "loss": 2.9192, + "theoretical_loss": 3.826678250351568, + "tokens_seen": 620235776 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004101303911735206, + "loss": 3.1303, + "theoretical_loss": 3.826636307246698, + "tokens_seen": 620301312 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041012036108324976, + "loss": 2.9557, + "theoretical_loss": 3.8265943698135962, + "tokens_seen": 620366848 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041011033099297894, + "loss": 3.1545, + "theoretical_loss": 3.8265524380508955, + "tokens_seen": 620432384 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004101003009027081, + "loss": 2.8538, + "theoretical_loss": 3.8265105119572302, + "tokens_seen": 620497920 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004100902708124373, + "loss": 3.1324, + "theoretical_loss": 3.826468591531236, + "tokens_seen": 620563456 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004100802407221665, + "loss": 3.001, + "theoretical_loss": 3.826426676771548, + "tokens_seen": 620628992 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004100702106318957, + "loss": 2.9697, + "theoretical_loss": 3.826384767676802, + "tokens_seen": 620694528 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041006018054162485, + "loss": 3.169, + "theoretical_loss": 3.8263428642456345, + "tokens_seen": 620760064 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004100501504513541, + "loss": 2.9758, + "theoretical_loss": 3.8263009664766825, + "tokens_seen": 620825600 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004100401203610832, + "loss": 2.9164, + "theoretical_loss": 3.8262590743685827, + "tokens_seen": 620891136 + }, + { + "epoch": 1.08, + "objective/train/docs_used": 1010658, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9779748916625977, + "objective/train/theoretical_loss": 3.826227659001623, + "objective/train/tokens_used": 641400288, + "theoretical_loss": 3.826227659001623, + "tokens_seen": 620940288 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041003009027081245, + "loss": 3.1209, + "theoretical_loss": 3.826217187919973, + "tokens_seen": 620956672 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041002006018054163, + "loss": 2.9284, + "theoretical_loss": 3.826175307129492, + "tokens_seen": 621022208 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004100100300902708, + "loss": 3.0038, + "theoretical_loss": 3.8261334319957787, + "tokens_seen": 621087744 + }, + { + "epoch": 1.08, + "learning_rate": 0.00041, + "loss": 3.005, + "theoretical_loss": 3.826091562517471, + "tokens_seen": 621153280 + }, + { + "epoch": 1.08, + "learning_rate": 0.00040998996990972917, + "loss": 3.0893, + "theoretical_loss": 3.82604969869321, + "tokens_seen": 621218816 + }, + { + "epoch": 1.08, + "learning_rate": 0.00040997993981945835, + "loss": 3.0471, + "theoretical_loss": 3.826007840521635, + "tokens_seen": 621284352 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004099699097291876, + "loss": 3.0709, + "theoretical_loss": 3.825965988001387, + "tokens_seen": 621349888 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004099598796389167, + "loss": 3.0722, + "theoretical_loss": 3.8259241411311065, + "tokens_seen": 621415424 + }, + { + "epoch": 1.08, + "learning_rate": 0.00040994984954864595, + "loss": 2.9999, + "theoretical_loss": 3.8258822999094355, + "tokens_seen": 621480960 + }, + { + "epoch": 1.08, + "learning_rate": 0.00040993981945837513, + "loss": 3.0037, + "theoretical_loss": 3.825840464335016, + "tokens_seen": 621546496 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004099297893681043, + "loss": 3.0818, + "theoretical_loss": 3.8257986344064903, + "tokens_seen": 621612032 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004099197592778335, + "loss": 2.9584, + "theoretical_loss": 3.8257568101225012, + "tokens_seen": 621677568 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004099097291875627, + "loss": 2.8665, + "theoretical_loss": 3.825714991481693, + "tokens_seen": 621743104 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004098996990972919, + "loss": 2.9426, + "theoretical_loss": 3.825673178482708, + "tokens_seen": 621808640 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004098896690070211, + "loss": 3.0847, + "theoretical_loss": 3.8256313711241914, + "tokens_seen": 621874176 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004098796389167503, + "loss": 2.9801, + "theoretical_loss": 3.825589569404789, + "tokens_seen": 621939712 + }, + { + "epoch": 1.08, + "learning_rate": 0.00040986960882647945, + "loss": 2.9626, + "theoretical_loss": 3.8255477733231444, + "tokens_seen": 622005248 + }, + { + "epoch": 1.08, + "learning_rate": 0.00040985957873620864, + "loss": 3.1363, + "theoretical_loss": 3.825505982877904, + "tokens_seen": 622070784 + }, + { + "epoch": 1.08, + "learning_rate": 0.0004098495486459378, + "loss": 3.1317, + "theoretical_loss": 3.8254641980677144, + "tokens_seen": 622136320 + }, + { + "epoch": 1.08, + "learning_rate": 0.00040983951855566705, + "loss": 2.9657, + "theoretical_loss": 3.8254224188912223, + "tokens_seen": 622201856 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004098294884653962, + "loss": 2.9002, + "theoretical_loss": 3.8253806453470744, + "tokens_seen": 622267392 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004098194583751254, + "loss": 2.9032, + "theoretical_loss": 3.825338877433918, + "tokens_seen": 622332928 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004098094282848546, + "loss": 2.7879, + "theoretical_loss": 3.825297115150402, + "tokens_seen": 622398464 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004097993981945838, + "loss": 2.9399, + "theoretical_loss": 3.8252553584951743, + "tokens_seen": 622464000 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040978936810431296, + "loss": 3.0939, + "theoretical_loss": 3.8252136074668845, + "tokens_seen": 622529536 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1013555, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.733370065689087, + "objective/train/theoretical_loss": 3.8251822978875327, + "objective/train/tokens_used": 643038688, + "theoretical_loss": 3.8251822978875327, + "tokens_seen": 622578688 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040977933801404214, + "loss": 2.9161, + "theoretical_loss": 3.825171862064182, + "tokens_seen": 622595072 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004097693079237713, + "loss": 2.8532, + "theoretical_loss": 3.8251301222857164, + "tokens_seen": 622660608 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040975927783350055, + "loss": 2.9665, + "theoretical_loss": 3.8250883881301387, + "tokens_seen": 622726144 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004097492477432297, + "loss": 2.8283, + "theoretical_loss": 3.8250466595960986, + "tokens_seen": 622791680 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004097392176529589, + "loss": 2.974, + "theoretical_loss": 3.825004936682249, + "tokens_seen": 622857216 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040972918756268804, + "loss": 2.9141, + "theoretical_loss": 3.8249632193872403, + "tokens_seen": 622922752 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004097191574724173, + "loss": 2.9542, + "theoretical_loss": 3.8249215077097265, + "tokens_seen": 622988288 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040970912738214646, + "loss": 2.9776, + "theoretical_loss": 3.8248798016483585, + "tokens_seen": 623053824 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040969909729187564, + "loss": 2.9219, + "theoretical_loss": 3.8248381012017907, + "tokens_seen": 623119360 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004096890672016048, + "loss": 2.8302, + "theoretical_loss": 3.8247964063686757, + "tokens_seen": 623184896 + }, + { + "epoch": 1.09, + "learning_rate": 0.000409679037111334, + "loss": 2.9391, + "theoretical_loss": 3.8247547171476692, + "tokens_seen": 623250432 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004096690070210632, + "loss": 3.2108, + "theoretical_loss": 3.8247130335374244, + "tokens_seen": 623315968 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004096589769307924, + "loss": 3.0568, + "theoretical_loss": 3.824671355536597, + "tokens_seen": 623381504 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040964894684052155, + "loss": 2.9195, + "theoretical_loss": 3.8246296831438427, + "tokens_seen": 623447040 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004096389167502508, + "loss": 3.0045, + "theoretical_loss": 3.824588016357817, + "tokens_seen": 623512576 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040962888665997996, + "loss": 2.9461, + "theoretical_loss": 3.8245463551771772, + "tokens_seen": 623578112 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040961885656970914, + "loss": 2.9125, + "theoretical_loss": 3.8245046996005794, + "tokens_seen": 623643648 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004096088264794383, + "loss": 3.0042, + "theoretical_loss": 3.824463049626681, + "tokens_seen": 623709184 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004095987963891675, + "loss": 2.9502, + "theoretical_loss": 3.82442140525414, + "tokens_seen": 623774720 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004095887662988967, + "loss": 3.1313, + "theoretical_loss": 3.824379766481615, + "tokens_seen": 623840256 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004095787362086259, + "loss": 2.8429, + "theoretical_loss": 3.824338133307765, + "tokens_seen": 623905792 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040956870611835505, + "loss": 3.0737, + "theoretical_loss": 3.8242965057312484, + "tokens_seen": 623971328 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004095586760280843, + "loss": 3.0588, + "theoretical_loss": 3.8242548837507253, + "tokens_seen": 624036864 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004095486459378134, + "loss": 2.8793, + "theoretical_loss": 3.824213267364856, + "tokens_seen": 624102400 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040953861584754265, + "loss": 2.9933, + "theoretical_loss": 3.8241716565723003, + "tokens_seen": 624167936 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1016336, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8383963108062744, + "objective/train/theoretical_loss": 3.824140452147691, + "objective/train/tokens_used": 644677088, + "theoretical_loss": 3.824140452147691, + "tokens_seen": 624217088 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040952858575727183, + "loss": 2.8796, + "theoretical_loss": 3.824130051371721, + "tokens_seen": 624233472 + }, + { + "epoch": 1.09, + "learning_rate": 0.000409518555667001, + "loss": 3.033, + "theoretical_loss": 3.824088451761778, + "tokens_seen": 624299008 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004095085255767302, + "loss": 2.8586, + "theoretical_loss": 3.824046857741134, + "tokens_seen": 624364544 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040949849548645937, + "loss": 2.9302, + "theoretical_loss": 3.8240052693084516, + "tokens_seen": 624430080 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040948846539618855, + "loss": 3.1976, + "theoretical_loss": 3.8239636864623936, + "tokens_seen": 624495616 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004094784353059178, + "loss": 2.9775, + "theoretical_loss": 3.8239221092016233, + "tokens_seen": 624561152 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004094684052156469, + "loss": 2.9845, + "theoretical_loss": 3.823880537524804, + "tokens_seen": 624626688 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040945837512537615, + "loss": 2.9222, + "theoretical_loss": 3.823838971430601, + "tokens_seen": 624692224 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040944834503510533, + "loss": 3.0215, + "theoretical_loss": 3.8237974109176793, + "tokens_seen": 624757760 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004094383149448345, + "loss": 2.931, + "theoretical_loss": 3.823755855984703, + "tokens_seen": 624823296 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004094282848545637, + "loss": 3.045, + "theoretical_loss": 3.823714306630338, + "tokens_seen": 624888832 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004094182547642929, + "loss": 2.8875, + "theoretical_loss": 3.8236727628532505, + "tokens_seen": 624954368 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040940822467402206, + "loss": 2.9387, + "theoretical_loss": 3.8236312246521074, + "tokens_seen": 625019904 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004093981945837513, + "loss": 3.1388, + "theoretical_loss": 3.8235896920255756, + "tokens_seen": 625085440 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004093881644934804, + "loss": 2.9819, + "theoretical_loss": 3.823548164972323, + "tokens_seen": 625150976 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040937813440320965, + "loss": 3.0518, + "theoretical_loss": 3.823506643491017, + "tokens_seen": 625216512 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004093681043129388, + "loss": 2.9969, + "theoretical_loss": 3.8234651275803264, + "tokens_seen": 625282048 + }, + { + "epoch": 1.09, + "learning_rate": 0.000409358074222668, + "loss": 2.8625, + "theoretical_loss": 3.8234236172389204, + "tokens_seen": 625347584 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004093480441323972, + "loss": 3.0753, + "theoretical_loss": 3.8233821124654677, + "tokens_seen": 625413120 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004093380140421264, + "loss": 3.0999, + "theoretical_loss": 3.8233406132586376, + "tokens_seen": 625478656 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040932798395185556, + "loss": 3.0105, + "theoretical_loss": 3.823299119617102, + "tokens_seen": 625544192 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004093179538615848, + "loss": 3.0352, + "theoretical_loss": 3.8232576315395304, + "tokens_seen": 625609728 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004093079237713139, + "loss": 2.9329, + "theoretical_loss": 3.823216149024594, + "tokens_seen": 625675264 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040929789368104316, + "loss": 2.9528, + "theoretical_loss": 3.823174672070965, + "tokens_seen": 625740800 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004092878635907723, + "loss": 2.9942, + "theoretical_loss": 3.823133200677316, + "tokens_seen": 625806336 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1019051, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0032596588134766, + "objective/train/theoretical_loss": 3.823102100780016, + "objective/train/tokens_used": 646315488, + "theoretical_loss": 3.823102100780016, + "tokens_seen": 625855488 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004092778335005015, + "loss": 2.9535, + "theoretical_loss": 3.8230917348423175, + "tokens_seen": 625871872 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004092678034102307, + "loss": 3.1058, + "theoretical_loss": 3.8230502745646446, + "tokens_seen": 625937408 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004092577733199599, + "loss": 3.1235, + "theoretical_loss": 3.8230088198429697, + "tokens_seen": 626002944 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040924774322968906, + "loss": 2.9499, + "theoretical_loss": 3.8229673706759666, + "tokens_seen": 626068480 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040923771313941824, + "loss": 3.0921, + "theoretical_loss": 3.82292592706231, + "tokens_seen": 626134016 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004092276830491474, + "loss": 3.1164, + "theoretical_loss": 3.8228844890006757, + "tokens_seen": 626199552 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040921765295887666, + "loss": 2.867, + "theoretical_loss": 3.822843056489737, + "tokens_seen": 626265088 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004092076228686058, + "loss": 2.946, + "theoretical_loss": 3.8228016295281715, + "tokens_seen": 626330624 + }, + { + "epoch": 1.09, + "learning_rate": 0.000409197592778335, + "loss": 2.9471, + "theoretical_loss": 3.822760208114654, + "tokens_seen": 626396160 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040918756268806415, + "loss": 2.9033, + "theoretical_loss": 3.822718792247862, + "tokens_seen": 626461696 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004091775325977934, + "loss": 3.093, + "theoretical_loss": 3.8226773819264723, + "tokens_seen": 626527232 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040916750250752257, + "loss": 3.0445, + "theoretical_loss": 3.8226359771491625, + "tokens_seen": 626592768 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040915747241725175, + "loss": 2.9474, + "theoretical_loss": 3.82259457791461, + "tokens_seen": 626658304 + }, + { + "epoch": 1.09, + "learning_rate": 0.000409147442326981, + "loss": 3.124, + "theoretical_loss": 3.8225531842214946, + "tokens_seen": 626723840 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040913741223671016, + "loss": 2.9494, + "theoretical_loss": 3.8225117960684942, + "tokens_seen": 626789376 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040912738214643934, + "loss": 3.1632, + "theoretical_loss": 3.8224704134542877, + "tokens_seen": 626854912 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004091173520561685, + "loss": 3.0652, + "theoretical_loss": 3.8224290363775566, + "tokens_seen": 626920448 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004091073219658977, + "loss": 3.0392, + "theoretical_loss": 3.8223876648369792, + "tokens_seen": 626985984 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004090972918756269, + "loss": 2.9399, + "theoretical_loss": 3.8223462988312376, + "tokens_seen": 627051520 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004090872617853561, + "loss": 2.9804, + "theoretical_loss": 3.8223049383590126, + "tokens_seen": 627117056 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040907723169508525, + "loss": 3.1641, + "theoretical_loss": 3.8222635834189864, + "tokens_seen": 627182592 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004090672016048145, + "loss": 3.1328, + "theoretical_loss": 3.82222223400984, + "tokens_seen": 627248128 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004090571715145436, + "loss": 2.9586, + "theoretical_loss": 3.822180890130256, + "tokens_seen": 627313664 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040904714142427285, + "loss": 3.0728, + "theoretical_loss": 3.8221395517789185, + "tokens_seen": 627379200 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040903711133400203, + "loss": 2.9611, + "theoretical_loss": 3.82209821895451, + "tokens_seen": 627444736 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1020436, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1370832920074463, + "objective/train/theoretical_loss": 3.822067222962459, + "objective/train/tokens_used": 647953888, + "theoretical_loss": 3.822067222962459, + "tokens_seen": 627493888 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004090270812437312, + "loss": 2.9739, + "theoretical_loss": 3.8220568916557145, + "tokens_seen": 627510272 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004090170511534604, + "loss": 2.9909, + "theoretical_loss": 3.8220155698812164, + "tokens_seen": 627575808 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040900702106318957, + "loss": 3.1376, + "theoretical_loss": 3.8219742536297003, + "tokens_seen": 627641344 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040899699097291875, + "loss": 2.8929, + "theoretical_loss": 3.821932942899852, + "tokens_seen": 627706880 + }, + { + "epoch": 1.09, + "learning_rate": 0.000408986960882648, + "loss": 2.9869, + "theoretical_loss": 3.821891637690357, + "tokens_seen": 627772416 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004089769307923771, + "loss": 2.9913, + "theoretical_loss": 3.8218503379999014, + "tokens_seen": 627837952 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040896690070210635, + "loss": 3.045, + "theoretical_loss": 3.821809043827171, + "tokens_seen": 627903488 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040895687061183553, + "loss": 3.0222, + "theoretical_loss": 3.8217677551708538, + "tokens_seen": 627969024 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004089468405215647, + "loss": 2.9274, + "theoretical_loss": 3.821726472029637, + "tokens_seen": 628034560 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004089368104312939, + "loss": 2.8704, + "theoretical_loss": 3.8216851944022086, + "tokens_seen": 628100096 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004089267803410231, + "loss": 2.9921, + "theoretical_loss": 3.821643922287257, + "tokens_seen": 628165632 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040891675025075226, + "loss": 3.068, + "theoretical_loss": 3.821602655683471, + "tokens_seen": 628231168 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004089067201604815, + "loss": 2.9546, + "theoretical_loss": 3.8215613945895393, + "tokens_seen": 628296704 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004088966900702106, + "loss": 3.0735, + "theoretical_loss": 3.8215201390041527, + "tokens_seen": 628362240 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040888665997993985, + "loss": 2.9453, + "theoretical_loss": 3.8214788889260007, + "tokens_seen": 628427776 + }, + { + "epoch": 1.09, + "learning_rate": 0.000408876629889669, + "loss": 2.9569, + "theoretical_loss": 3.8214376443537743, + "tokens_seen": 628493312 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004088665997993982, + "loss": 2.9606, + "theoretical_loss": 3.8213964052861638, + "tokens_seen": 628558848 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004088565697091274, + "loss": 2.9346, + "theoretical_loss": 3.8213551717218612, + "tokens_seen": 628624384 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004088465396188566, + "loss": 3.0246, + "theoretical_loss": 3.821313943659559, + "tokens_seen": 628689920 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040883650952858576, + "loss": 3.156, + "theoretical_loss": 3.821272721097949, + "tokens_seen": 628755456 + }, + { + "epoch": 1.09, + "learning_rate": 0.000408826479438315, + "loss": 2.9213, + "theoretical_loss": 3.821231504035724, + "tokens_seen": 628820992 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004088164493480441, + "loss": 2.954, + "theoretical_loss": 3.8211902924715777, + "tokens_seen": 628886528 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040880641925777336, + "loss": 3.0156, + "theoretical_loss": 3.8211490864042035, + "tokens_seen": 628952064 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004087963891675025, + "loss": 3.0283, + "theoretical_loss": 3.821107885832296, + "tokens_seen": 629017600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004087863590772317, + "loss": 2.962, + "theoretical_loss": 3.821066690754549, + "tokens_seen": 629083136 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1023249, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7771530151367188, + "objective/train/theoretical_loss": 3.821035798050998, + "objective/train/tokens_used": 649592288, + "theoretical_loss": 3.821035798050998, + "tokens_seen": 629132288 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004087763289869609, + "loss": 2.8503, + "theoretical_loss": 3.821025501169659, + "tokens_seen": 629148672 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004087662988966901, + "loss": 2.9836, + "theoretical_loss": 3.8209843170763205, + "tokens_seen": 629214208 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040875626880641926, + "loss": 2.974, + "theoretical_loss": 3.82094313847323, + "tokens_seen": 629279744 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040874623871614844, + "loss": 3.0218, + "theoretical_loss": 3.820901965359083, + "tokens_seen": 629345280 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004087362086258776, + "loss": 2.9434, + "theoretical_loss": 3.8208607977325775, + "tokens_seen": 629410816 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040872617853560686, + "loss": 3.1159, + "theoretical_loss": 3.82081963559241, + "tokens_seen": 629476352 + }, + { + "epoch": 1.09, + "learning_rate": 0.000408716148445336, + "loss": 3.0625, + "theoretical_loss": 3.8207784789372785, + "tokens_seen": 629541888 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004087061183550652, + "loss": 3.0017, + "theoretical_loss": 3.820737327765882, + "tokens_seen": 629607424 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040869608826479435, + "loss": 3.1332, + "theoretical_loss": 3.820696182076918, + "tokens_seen": 629672960 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004086860581745236, + "loss": 2.9601, + "theoretical_loss": 3.8206550418690863, + "tokens_seen": 629738496 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040867602808425277, + "loss": 2.7847, + "theoretical_loss": 3.820613907141086, + "tokens_seen": 629804032 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040866599799398195, + "loss": 2.8709, + "theoretical_loss": 3.820572777891617, + "tokens_seen": 629869568 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040865596790371113, + "loss": 2.9937, + "theoretical_loss": 3.820531654119381, + "tokens_seen": 629935104 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040864593781344036, + "loss": 2.8966, + "theoretical_loss": 3.820490535823078, + "tokens_seen": 630000640 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004086359077231695, + "loss": 2.9883, + "theoretical_loss": 3.820449423001408, + "tokens_seen": 630066176 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004086258776328987, + "loss": 3.0405, + "theoretical_loss": 3.820408315653075, + "tokens_seen": 630131712 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040861584754262785, + "loss": 2.999, + "theoretical_loss": 3.8203672137767795, + "tokens_seen": 630197248 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004086058174523571, + "loss": 3.176, + "theoretical_loss": 3.8203261173712253, + "tokens_seen": 630262784 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040859578736208627, + "loss": 2.9107, + "theoretical_loss": 3.820285026435115, + "tokens_seen": 630328320 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040858575727181545, + "loss": 3.0169, + "theoretical_loss": 3.8202439409671523, + "tokens_seen": 630393856 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040857572718154463, + "loss": 2.9561, + "theoretical_loss": 3.820202860966041, + "tokens_seen": 630459392 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004085656970912738, + "loss": 2.8702, + "theoretical_loss": 3.8201617864304853, + "tokens_seen": 630524928 + }, + { + "epoch": 1.09, + "learning_rate": 0.000408555667001003, + "loss": 3.0769, + "theoretical_loss": 3.8201207173591905, + "tokens_seen": 630590464 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040854563691073223, + "loss": 3.0772, + "theoretical_loss": 3.8200796537508612, + "tokens_seen": 630656000 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040853560682046136, + "loss": 2.8275, + "theoretical_loss": 3.8200385956042044, + "tokens_seen": 630721536 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1026131, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9523565769195557, + "objective/train/theoretical_loss": 3.8200078055776556, + "objective/train/tokens_used": 651230688, + "theoretical_loss": 3.8200078055776556, + "tokens_seen": 630770688 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004085255767301906, + "loss": 2.9977, + "theoretical_loss": 3.8199975429179256, + "tokens_seen": 630787072 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004085155466399197, + "loss": 3.1022, + "theoretical_loss": 3.8199564956907306, + "tokens_seen": 630852608 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040850551654964895, + "loss": 3.0991, + "theoretical_loss": 3.819915453921328, + "tokens_seen": 630918144 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040849548645937814, + "loss": 2.9219, + "theoretical_loss": 3.819874417608424, + "tokens_seen": 630983680 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004084854563691073, + "loss": 3.0485, + "theoretical_loss": 3.819833386750727, + "tokens_seen": 631049216 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004084754262788365, + "loss": 3.005, + "theoretical_loss": 3.8197923613469458, + "tokens_seen": 631114752 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040846539618856573, + "loss": 3.0151, + "theoretical_loss": 3.8197513413957886, + "tokens_seen": 631180288 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040845536609829486, + "loss": 2.9197, + "theoretical_loss": 3.819710326895965, + "tokens_seen": 631245824 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004084453360080241, + "loss": 3.0162, + "theoretical_loss": 3.8196693178461847, + "tokens_seen": 631311360 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004084353059177532, + "loss": 3.0524, + "theoretical_loss": 3.8196283142451577, + "tokens_seen": 631376896 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040842527582748246, + "loss": 2.8588, + "theoretical_loss": 3.819587316091595, + "tokens_seen": 631442432 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040841524573721164, + "loss": 3.084, + "theoretical_loss": 3.8195463233842064, + "tokens_seen": 631507968 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004084052156469408, + "loss": 2.8722, + "theoretical_loss": 3.819505336121705, + "tokens_seen": 631573504 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040839518555667005, + "loss": 2.8758, + "theoretical_loss": 3.8194643543028013, + "tokens_seen": 631639040 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004083851554663992, + "loss": 3.1534, + "theoretical_loss": 3.819423377926209, + "tokens_seen": 631704576 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004083751253761284, + "loss": 2.97, + "theoretical_loss": 3.8193824069906395, + "tokens_seen": 631770112 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004083650952858576, + "loss": 2.8138, + "theoretical_loss": 3.8193414414948066, + "tokens_seen": 631835648 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004083550651955868, + "loss": 3.0958, + "theoretical_loss": 3.8193004814374243, + "tokens_seen": 631901184 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040834503510531596, + "loss": 3.1983, + "theoretical_loss": 3.8192595268172065, + "tokens_seen": 631966720 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004083350050150452, + "loss": 3.0944, + "theoretical_loss": 3.819218577632868, + "tokens_seen": 632032256 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004083249749247743, + "loss": 3.011, + "theoretical_loss": 3.8191776338831227, + "tokens_seen": 632097792 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040831494483450356, + "loss": 3.0517, + "theoretical_loss": 3.8191366955666863, + "tokens_seen": 632163328 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004083049147442327, + "loss": 3.0151, + "theoretical_loss": 3.8190957626822755, + "tokens_seen": 632228864 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004082948846539619, + "loss": 3.0988, + "theoretical_loss": 3.819054835228606, + "tokens_seen": 632294400 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004082848545636911, + "loss": 3.1272, + "theoretical_loss": 3.8190139132043948, + "tokens_seen": 632359936 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1029177, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3009824752807617, + "objective/train/theoretical_loss": 3.8189832252485463, + "objective/train/tokens_used": 652869088, + "theoretical_loss": 3.8189832252485463, + "tokens_seen": 632409088 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004082748244734203, + "loss": 3.0431, + "theoretical_loss": 3.818972996608358, + "tokens_seen": 632425472 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040826479438314946, + "loss": 3.0359, + "theoretical_loss": 3.818932085439215, + "tokens_seen": 632491008 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040825476429287864, + "loss": 2.9691, + "theoretical_loss": 3.818891179695682, + "tokens_seen": 632556544 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004082447342026078, + "loss": 3.0144, + "theoretical_loss": 3.8188502793764787, + "tokens_seen": 632622080 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040823470411233706, + "loss": 3.172, + "theoretical_loss": 3.818809384480324, + "tokens_seen": 632687616 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004082246740220662, + "loss": 3.0755, + "theoretical_loss": 3.818768495005936, + "tokens_seen": 632753152 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004082146439317954, + "loss": 2.9672, + "theoretical_loss": 3.8187276109520356, + "tokens_seen": 632818688 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040820461384152455, + "loss": 3.0034, + "theoretical_loss": 3.8186867323173423, + "tokens_seen": 632884224 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004081945837512538, + "loss": 3.0795, + "theoretical_loss": 3.818645859100577, + "tokens_seen": 632949760 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040818455366098297, + "loss": 3.0579, + "theoretical_loss": 3.818604991300461, + "tokens_seen": 633015296 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040817452357071215, + "loss": 2.9315, + "theoretical_loss": 3.8185641289157153, + "tokens_seen": 633080832 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040816449348044133, + "loss": 2.9828, + "theoretical_loss": 3.818523271945063, + "tokens_seen": 633146368 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040815446339017056, + "loss": 2.981, + "theoretical_loss": 3.818482420387225, + "tokens_seen": 633211904 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004081444332998997, + "loss": 2.9455, + "theoretical_loss": 3.818441574240924, + "tokens_seen": 633277440 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004081344032096289, + "loss": 2.8367, + "theoretical_loss": 3.818400733504885, + "tokens_seen": 633342976 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040812437311935805, + "loss": 2.9042, + "theoretical_loss": 3.81835989817783, + "tokens_seen": 633408512 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004081143430290873, + "loss": 3.0302, + "theoretical_loss": 3.818319068258484, + "tokens_seen": 633474048 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040810431293881647, + "loss": 3.065, + "theoretical_loss": 3.8182782437455707, + "tokens_seen": 633539584 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040809428284854565, + "loss": 2.9331, + "theoretical_loss": 3.8182374246378155, + "tokens_seen": 633605120 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040808425275827483, + "loss": 2.9241, + "theoretical_loss": 3.8181966109339447, + "tokens_seen": 633670656 + }, + { + "epoch": 1.09, + "learning_rate": 0.000408074222668004, + "loss": 2.9072, + "theoretical_loss": 3.8181558026326825, + "tokens_seen": 633736192 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004080641925777332, + "loss": 2.9376, + "theoretical_loss": 3.8181149997327566, + "tokens_seen": 633801728 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040805416248746243, + "loss": 3.023, + "theoretical_loss": 3.818074202232893, + "tokens_seen": 633867264 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040804413239719156, + "loss": 3.0847, + "theoretical_loss": 3.818033410131818, + "tokens_seen": 633932800 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004080341023069208, + "loss": 2.91, + "theoretical_loss": 3.8179926234282604, + "tokens_seen": 633998336 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1032058, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.287862777709961, + "objective/train/theoretical_loss": 3.817962036941948, + "objective/train/tokens_used": 654507488, + "theoretical_loss": 3.817962036941948, + "tokens_seen": 634047488 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004080240722166499, + "loss": 3.1734, + "theoretical_loss": 3.817951842120948, + "tokens_seen": 634063872 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040801404212637915, + "loss": 2.9534, + "theoretical_loss": 3.817911066208609, + "tokens_seen": 634129408 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040800401203610834, + "loss": 3.1266, + "theoretical_loss": 3.817870295689972, + "tokens_seen": 634194944 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004079939819458375, + "loss": 2.9256, + "theoretical_loss": 3.817829530563767, + "tokens_seen": 634260480 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004079839518555667, + "loss": 3.0431, + "theoretical_loss": 3.817788770828723, + "tokens_seen": 634326016 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040797392176529593, + "loss": 3.0693, + "theoretical_loss": 3.81774801648357, + "tokens_seen": 634391552 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040796389167502506, + "loss": 3.0739, + "theoretical_loss": 3.81770726752704, + "tokens_seen": 634457088 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004079538615847543, + "loss": 2.8141, + "theoretical_loss": 3.817666523957862, + "tokens_seen": 634522624 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004079438314944834, + "loss": 3.1013, + "theoretical_loss": 3.8176257857747684, + "tokens_seen": 634588160 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040793380140421266, + "loss": 3.0732, + "theoretical_loss": 3.817585052976492, + "tokens_seen": 634653696 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040792377131394184, + "loss": 3.0377, + "theoretical_loss": 3.817544325561763, + "tokens_seen": 634719232 + }, + { + "epoch": 1.09, + "learning_rate": 0.000407913741223671, + "loss": 3.1305, + "theoretical_loss": 3.8175036035293157, + "tokens_seen": 634784768 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004079037111334002, + "loss": 2.9945, + "theoretical_loss": 3.817462886877883, + "tokens_seen": 634850304 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004078936810431294, + "loss": 3.0824, + "theoretical_loss": 3.817422175606198, + "tokens_seen": 634915840 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040788365095285856, + "loss": 2.9728, + "theoretical_loss": 3.817381469712995, + "tokens_seen": 634981376 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004078736208625878, + "loss": 3.0274, + "theoretical_loss": 3.817340769197009, + "tokens_seen": 635046912 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004078635907723169, + "loss": 3.053, + "theoretical_loss": 3.8173000740569734, + "tokens_seen": 635112448 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040785356068204616, + "loss": 3.007, + "theoretical_loss": 3.817259384291625, + "tokens_seen": 635177984 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004078435305917753, + "loss": 2.8441, + "theoretical_loss": 3.8172186998996986, + "tokens_seen": 635243520 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004078335005015045, + "loss": 2.9143, + "theoretical_loss": 3.8171780208799304, + "tokens_seen": 635309056 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004078234704112337, + "loss": 3.033, + "theoretical_loss": 3.817137347231058, + "tokens_seen": 635374592 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004078134403209629, + "loss": 2.9875, + "theoretical_loss": 3.817096678951817, + "tokens_seen": 635440128 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040780341023069207, + "loss": 2.9904, + "theoretical_loss": 3.8170560160409455, + "tokens_seen": 635505664 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004077933801404213, + "loss": 2.9841, + "theoretical_loss": 3.8170153584971813, + "tokens_seen": 635571200 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040778335005015043, + "loss": 2.9752, + "theoretical_loss": 3.8169747063192627, + "tokens_seen": 635636736 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1034455, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.931568145751953, + "objective/train/theoretical_loss": 3.816944220706401, + "objective/train/tokens_used": 656145888, + "theoretical_loss": 3.816944220706401, + "tokens_seen": 635685888 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040777331995987966, + "loss": 2.9751, + "theoretical_loss": 3.8169340595059285, + "tokens_seen": 635702272 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004077632898696088, + "loss": 3.0928, + "theoretical_loss": 3.8168934180559173, + "tokens_seen": 635767808 + }, + { + "epoch": 1.09, + "learning_rate": 0.000407753259779338, + "loss": 2.9581, + "theoretical_loss": 3.8168527819679694, + "tokens_seen": 635833344 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004077432296890672, + "loss": 3.0901, + "theoretical_loss": 3.816812151240825, + "tokens_seen": 635898880 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004077331995987964, + "loss": 2.9364, + "theoretical_loss": 3.8167715258732233, + "tokens_seen": 635964416 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040772316950852557, + "loss": 2.9905, + "theoretical_loss": 3.8167309058639063, + "tokens_seen": 636029952 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040771313941825475, + "loss": 3.0537, + "theoretical_loss": 3.8166902912116143, + "tokens_seen": 636095488 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040770310932798393, + "loss": 2.9492, + "theoretical_loss": 3.8166496819150897, + "tokens_seen": 636161024 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040769307923771317, + "loss": 3.0276, + "theoretical_loss": 3.816609077973075, + "tokens_seen": 636226560 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004076830491474423, + "loss": 2.897, + "theoretical_loss": 3.816568479384311, + "tokens_seen": 636292096 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040767301905717153, + "loss": 3.0757, + "theoretical_loss": 3.8165278861475422, + "tokens_seen": 636357632 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004076629889669007, + "loss": 3.1052, + "theoretical_loss": 3.816487298261512, + "tokens_seen": 636423168 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004076529588766299, + "loss": 3.0274, + "theoretical_loss": 3.8164467157249633, + "tokens_seen": 636488704 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004076429287863591, + "loss": 2.8568, + "theoretical_loss": 3.8164061385366415, + "tokens_seen": 636554240 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040763289869608825, + "loss": 3.0472, + "theoretical_loss": 3.81636556669529, + "tokens_seen": 636619776 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004076228686058175, + "loss": 3.0081, + "theoretical_loss": 3.8163250001996545, + "tokens_seen": 636685312 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040761283851554667, + "loss": 3.0794, + "theoretical_loss": 3.8162844390484807, + "tokens_seen": 636750848 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040760280842527585, + "loss": 3.176, + "theoretical_loss": 3.8162438832405146, + "tokens_seen": 636816384 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040759277833500503, + "loss": 3.0729, + "theoretical_loss": 3.816203332774502, + "tokens_seen": 636881920 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004075827482447342, + "loss": 2.898, + "theoretical_loss": 3.8161627876491897, + "tokens_seen": 636947456 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004075727181544634, + "loss": 3.1573, + "theoretical_loss": 3.816122247863326, + "tokens_seen": 637012992 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040756268806419263, + "loss": 2.9748, + "theoretical_loss": 3.816081713415657, + "tokens_seen": 637078528 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040755265797392176, + "loss": 3.2299, + "theoretical_loss": 3.8160411843049316, + "tokens_seen": 637144064 + }, + { + "epoch": 1.09, + "learning_rate": 0.000407542627883651, + "loss": 2.9795, + "theoretical_loss": 3.816000660529898, + "tokens_seen": 637209600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004075325977933801, + "loss": 2.812, + "theoretical_loss": 3.8159601420893052, + "tokens_seen": 637275136 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1037232, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9832491874694824, + "objective/train/theoretical_loss": 3.8159297567588357, + "objective/train/tokens_used": 657784288, + "theoretical_loss": 3.8159297567588357, + "tokens_seen": 637324288 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040752256770310935, + "loss": 3.0708, + "theoretical_loss": 3.815919628981903, + "tokens_seen": 637340672 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040751253761283854, + "loss": 3.1002, + "theoretical_loss": 3.8158791212064402, + "tokens_seen": 637406208 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004075025075225677, + "loss": 3.1542, + "theoretical_loss": 3.815838618761668, + "tokens_seen": 637471744 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004074924774322969, + "loss": 3.0671, + "theoretical_loss": 3.8157981216463357, + "tokens_seen": 637537280 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040748244734202613, + "loss": 3.0489, + "theoretical_loss": 3.815757629859195, + "tokens_seen": 637602816 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040747241725175526, + "loss": 2.7819, + "theoretical_loss": 3.815717143398998, + "tokens_seen": 637668352 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004074623871614845, + "loss": 2.9952, + "theoretical_loss": 3.815676662264495, + "tokens_seen": 637733888 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004074523570712136, + "loss": 3.0494, + "theoretical_loss": 3.81563618645444, + "tokens_seen": 637799424 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040744232698094286, + "loss": 3.1489, + "theoretical_loss": 3.8155957159675844, + "tokens_seen": 637864960 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040743229689067204, + "loss": 2.8307, + "theoretical_loss": 3.8155552508026815, + "tokens_seen": 637930496 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004074222668004012, + "loss": 2.8516, + "theoretical_loss": 3.8155147909584857, + "tokens_seen": 637996032 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004074122367101304, + "loss": 3.0053, + "theoretical_loss": 3.8154743364337493, + "tokens_seen": 638061568 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004074022066198596, + "loss": 3.0939, + "theoretical_loss": 3.815433887227228, + "tokens_seen": 638127104 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040739217652958876, + "loss": 3.0835, + "theoretical_loss": 3.815393443337677, + "tokens_seen": 638192640 + }, + { + "epoch": 1.09, + "learning_rate": 0.000407382146439318, + "loss": 3.1052, + "theoretical_loss": 3.8153530047638498, + "tokens_seen": 638258176 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004073721163490471, + "loss": 2.8899, + "theoretical_loss": 3.8153125715045038, + "tokens_seen": 638323712 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040736208625877636, + "loss": 2.9293, + "theoretical_loss": 3.8152721435583934, + "tokens_seen": 638389248 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004073520561685055, + "loss": 2.7819, + "theoretical_loss": 3.815231720924276, + "tokens_seen": 638454784 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004073420260782347, + "loss": 3.0613, + "theoretical_loss": 3.815191303600909, + "tokens_seen": 638520320 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004073319959879639, + "loss": 2.9329, + "theoretical_loss": 3.8151508915870482, + "tokens_seen": 638585856 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004073219658976931, + "loss": 2.998, + "theoretical_loss": 3.8151104848814525, + "tokens_seen": 638651392 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040731193580742227, + "loss": 3.0529, + "theoretical_loss": 3.8150700834828797, + "tokens_seen": 638716928 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004073019057171515, + "loss": 3.0643, + "theoretical_loss": 3.8150296873900884, + "tokens_seen": 638782464 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040729187562688063, + "loss": 2.9948, + "theoretical_loss": 3.814989296601837, + "tokens_seen": 638848000 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040728184553660986, + "loss": 2.9037, + "theoretical_loss": 3.814948911116886, + "tokens_seen": 638913536 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1038718, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.782565116882324, + "objective/train/theoretical_loss": 3.8149186254827168, + "objective/train/tokens_used": 659422688, + "theoretical_loss": 3.8149186254827168, + "tokens_seen": 638962688 + }, + { + "epoch": 1.09, + "learning_rate": 0.000407271815446339, + "loss": 2.8597, + "theoretical_loss": 3.8149085309339945, + "tokens_seen": 638979072 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004072617853560682, + "loss": 3.0655, + "theoretical_loss": 3.8148681560519226, + "tokens_seen": 639044608 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004072517552657974, + "loss": 3.0783, + "theoretical_loss": 3.814827786469431, + "tokens_seen": 639110144 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004072417251755266, + "loss": 3.0081, + "theoretical_loss": 3.8147874221852813, + "tokens_seen": 639175680 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040723169508525577, + "loss": 3.0959, + "theoretical_loss": 3.8147470631982348, + "tokens_seen": 639241216 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040722166499498495, + "loss": 2.8701, + "theoretical_loss": 3.8147067095070524, + "tokens_seen": 639306752 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040721163490471413, + "loss": 2.9427, + "theoretical_loss": 3.814666361110498, + "tokens_seen": 639372288 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040720160481444337, + "loss": 2.9532, + "theoretical_loss": 3.814626018007333, + "tokens_seen": 639437824 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004071915747241725, + "loss": 2.9825, + "theoretical_loss": 3.8145856801963207, + "tokens_seen": 639503360 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040718154463390173, + "loss": 2.8944, + "theoretical_loss": 3.8145453476762254, + "tokens_seen": 639568896 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004071715145436309, + "loss": 2.9107, + "theoretical_loss": 3.814505020445811, + "tokens_seen": 639634432 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004071614844533601, + "loss": 2.9519, + "theoretical_loss": 3.8144646985038406, + "tokens_seen": 639699968 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040715145436308927, + "loss": 2.9554, + "theoretical_loss": 3.814424381849081, + "tokens_seen": 639765504 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040714142427281845, + "loss": 2.929, + "theoretical_loss": 3.814384070480296, + "tokens_seen": 639831040 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040713139418254763, + "loss": 2.9293, + "theoretical_loss": 3.8143437643962512, + "tokens_seen": 639896576 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040712136409227687, + "loss": 3.0959, + "theoretical_loss": 3.8143034635957127, + "tokens_seen": 639962112 + }, + { + "epoch": 1.09, + "learning_rate": 0.000407111334002006, + "loss": 3.1705, + "theoretical_loss": 3.8142631680774484, + "tokens_seen": 640027648 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040710130391173523, + "loss": 3.0691, + "theoretical_loss": 3.8142228778402227, + "tokens_seen": 640093184 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040709127382146436, + "loss": 3.0325, + "theoretical_loss": 3.8141825928828053, + "tokens_seen": 640158720 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004070812437311936, + "loss": 2.8752, + "theoretical_loss": 3.8141423132039622, + "tokens_seen": 640224256 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004070712136409228, + "loss": 3.1857, + "theoretical_loss": 3.8141020388024627, + "tokens_seen": 640289792 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040706118355065196, + "loss": 3.0557, + "theoretical_loss": 3.814061769677074, + "tokens_seen": 640355328 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040705115346038114, + "loss": 3.0972, + "theoretical_loss": 3.8140215058265667, + "tokens_seen": 640420864 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004070411233701103, + "loss": 3.0665, + "theoretical_loss": 3.8139812472497088, + "tokens_seen": 640486400 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004070310932798395, + "loss": 3.0035, + "theoretical_loss": 3.8139409939452706, + "tokens_seen": 640551936 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1041494, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1155953407287598, + "objective/train/theoretical_loss": 3.813910807426227, + "objective/train/tokens_used": 661061088, + "theoretical_loss": 3.813910807426227, + "tokens_seen": 640601088 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040702106318956874, + "loss": 3.0247, + "theoretical_loss": 3.8139007459120218, + "tokens_seen": 640617472 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040701103309929786, + "loss": 2.9861, + "theoretical_loss": 3.8138605031487334, + "tokens_seen": 640683008 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004070010030090271, + "loss": 2.9838, + "theoretical_loss": 3.8138202656541766, + "tokens_seen": 640748544 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004069909729187563, + "loss": 3.0363, + "theoretical_loss": 3.8137800334271232, + "tokens_seen": 640814080 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040698094282848546, + "loss": 2.9208, + "theoretical_loss": 3.813739806466344, + "tokens_seen": 640879616 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040697091273821464, + "loss": 3.1229, + "theoretical_loss": 3.813699584770611, + "tokens_seen": 640945152 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004069608826479438, + "loss": 2.8541, + "theoretical_loss": 3.8136593683386986, + "tokens_seen": 641010688 + }, + { + "epoch": 1.09, + "learning_rate": 0.000406950852557673, + "loss": 2.975, + "theoretical_loss": 3.8136191571693785, + "tokens_seen": 641076224 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040694082246740224, + "loss": 3.0242, + "theoretical_loss": 3.813578951261424, + "tokens_seen": 641141760 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040693079237713137, + "loss": 3.0852, + "theoretical_loss": 3.81353875061361, + "tokens_seen": 641207296 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004069207622868606, + "loss": 3.1881, + "theoretical_loss": 3.81349855522471, + "tokens_seen": 641272832 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040691073219658973, + "loss": 3.0544, + "theoretical_loss": 3.8134583650934992, + "tokens_seen": 641338368 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040690070210631896, + "loss": 3.0751, + "theoretical_loss": 3.813418180218752, + "tokens_seen": 641403904 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004068906720160482, + "loss": 2.9386, + "theoretical_loss": 3.8133780005992444, + "tokens_seen": 641469440 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004068806419257773, + "loss": 2.9529, + "theoretical_loss": 3.813337826233753, + "tokens_seen": 641534976 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040687061183550656, + "loss": 2.831, + "theoretical_loss": 3.813297657121053, + "tokens_seen": 641600512 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004068605817452357, + "loss": 2.8515, + "theoretical_loss": 3.813257493259922, + "tokens_seen": 641666048 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004068505516549649, + "loss": 3.1408, + "theoretical_loss": 3.8132173346491367, + "tokens_seen": 641731584 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004068405215646941, + "loss": 3.0993, + "theoretical_loss": 3.8131771812874744, + "tokens_seen": 641797120 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004068304914744233, + "loss": 2.9286, + "theoretical_loss": 3.8131370331737138, + "tokens_seen": 641862656 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040682046138415247, + "loss": 3.0802, + "theoretical_loss": 3.8130968903066336, + "tokens_seen": 641928192 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004068104312938817, + "loss": 3.1549, + "theoretical_loss": 3.8130567526850108, + "tokens_seen": 641993728 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040680040120361083, + "loss": 3.0608, + "theoretical_loss": 3.813016620307627, + "tokens_seen": 642059264 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040679037111334006, + "loss": 2.9957, + "theoretical_loss": 3.8129764931732595, + "tokens_seen": 642124800 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004067803410230692, + "loss": 2.9311, + "theoretical_loss": 3.8129363712806903, + "tokens_seen": 642190336 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1044237, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.074524402618408, + "objective/train/theoretical_loss": 3.812906283300459, + "objective/train/tokens_used": 662699488, + "theoretical_loss": 3.812906283300459, + "tokens_seen": 642239488 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004067703109327984, + "loss": 3.0705, + "theoretical_loss": 3.812896254628699, + "tokens_seen": 642255872 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004067602808425276, + "loss": 2.9007, + "theoretical_loss": 3.8128561432160657, + "tokens_seen": 642321408 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004067502507522568, + "loss": 2.913, + "theoretical_loss": 3.812816037041573, + "tokens_seen": 642386944 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040674022066198597, + "loss": 2.7691, + "theoretical_loss": 3.812775936104002, + "tokens_seen": 642452480 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040673019057171515, + "loss": 3.0131, + "theoretical_loss": 3.8127358404021345, + "tokens_seen": 642518016 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040672016048144433, + "loss": 3.0166, + "theoretical_loss": 3.812695749934753, + "tokens_seen": 642583552 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040671013039117357, + "loss": 2.9131, + "theoretical_loss": 3.8126556647006415, + "tokens_seen": 642649088 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004067001003009027, + "loss": 3.1034, + "theoretical_loss": 3.812615584698582, + "tokens_seen": 642714624 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040669007021063193, + "loss": 3.1604, + "theoretical_loss": 3.812575509927359, + "tokens_seen": 642780160 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004066800401203611, + "loss": 2.8967, + "theoretical_loss": 3.812535440385755, + "tokens_seen": 642845696 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004066700100300903, + "loss": 3.0675, + "theoretical_loss": 3.8124953760725564, + "tokens_seen": 642911232 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040665997993981947, + "loss": 2.9658, + "theoretical_loss": 3.812455316986548, + "tokens_seen": 642976768 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040664994984954865, + "loss": 3.0054, + "theoretical_loss": 3.812415263126514, + "tokens_seen": 643042304 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040663991975927783, + "loss": 3.1507, + "theoretical_loss": 3.8123752144912406, + "tokens_seen": 643107840 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040662988966900707, + "loss": 3.0138, + "theoretical_loss": 3.812335171079514, + "tokens_seen": 643173376 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004066198595787362, + "loss": 3.0131, + "theoretical_loss": 3.812295132890122, + "tokens_seen": 643238912 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040660982948846543, + "loss": 2.8962, + "theoretical_loss": 3.812255099921849, + "tokens_seen": 643304448 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040659979939819456, + "loss": 3.0175, + "theoretical_loss": 3.812215072173484, + "tokens_seen": 643369984 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004065897693079238, + "loss": 3.0497, + "theoretical_loss": 3.8121750496438147, + "tokens_seen": 643435520 + }, + { + "epoch": 1.09, + "learning_rate": 0.000406579739217653, + "loss": 3.0187, + "theoretical_loss": 3.812135032331629, + "tokens_seen": 643501056 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040656970912738216, + "loss": 2.8786, + "theoretical_loss": 3.8120950202357156, + "tokens_seen": 643566592 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040655967903711134, + "loss": 2.9167, + "theoretical_loss": 3.812055013354863, + "tokens_seen": 643632128 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004065496489468405, + "loss": 3.0895, + "theoretical_loss": 3.812015011687861, + "tokens_seen": 643697664 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004065396188565697, + "loss": 2.9511, + "theoretical_loss": 3.8119750152335, + "tokens_seen": 643763200 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040652958876629894, + "loss": 2.9082, + "theoretical_loss": 3.8119350239905687, + "tokens_seen": 643828736 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1046759, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.072484254837036, + "objective/train/theoretical_loss": 3.8119050339776446, + "objective/train/tokens_used": 664337888, + "theoretical_loss": 3.8119050339776446, + "tokens_seen": 643877888 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040651955867602806, + "loss": 3.0148, + "theoretical_loss": 3.811895037957859, + "tokens_seen": 643894272 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004065095285857573, + "loss": 3.0359, + "theoretical_loss": 3.8118550571341614, + "tokens_seen": 643959808 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004064994984954865, + "loss": 2.9808, + "theoretical_loss": 3.811815081518267, + "tokens_seen": 644025344 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040648946840521566, + "loss": 3.1016, + "theoretical_loss": 3.811775111108968, + "tokens_seen": 644090880 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040647943831494484, + "loss": 2.8211, + "theoretical_loss": 3.811735145905057, + "tokens_seen": 644156416 + }, + { + "epoch": 1.09, + "learning_rate": 0.000406469408224674, + "loss": 3.0608, + "theoretical_loss": 3.8116951859053256, + "tokens_seen": 644221952 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004064593781344032, + "loss": 2.9489, + "theoretical_loss": 3.811655231108568, + "tokens_seen": 644287488 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040644934804413244, + "loss": 3.0908, + "theoretical_loss": 3.8116152815135758, + "tokens_seen": 644353024 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040643931795386157, + "loss": 3.0273, + "theoretical_loss": 3.8115753371191445, + "tokens_seen": 644418560 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004064292878635908, + "loss": 2.9452, + "theoretical_loss": 3.811535397924068, + "tokens_seen": 644484096 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040641925777331993, + "loss": 3.0304, + "theoretical_loss": 3.8114954639271406, + "tokens_seen": 644549632 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040640922768304916, + "loss": 2.9839, + "theoretical_loss": 3.8114555351271577, + "tokens_seen": 644615168 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040639919759277834, + "loss": 3.0105, + "theoretical_loss": 3.811415611522914, + "tokens_seen": 644680704 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004063891675025075, + "loss": 3.0383, + "theoretical_loss": 3.811375693113206, + "tokens_seen": 644746240 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004063791374122367, + "loss": 2.9634, + "theoretical_loss": 3.8113357798968295, + "tokens_seen": 644811776 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004063691073219659, + "loss": 2.9504, + "theoretical_loss": 3.8112958718725816, + "tokens_seen": 644877312 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040635907723169507, + "loss": 3.0089, + "theoretical_loss": 3.811255969039259, + "tokens_seen": 644942848 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004063490471414243, + "loss": 2.9469, + "theoretical_loss": 3.811216071395659, + "tokens_seen": 645008384 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040633901705115343, + "loss": 3.0354, + "theoretical_loss": 3.81117617894058, + "tokens_seen": 645073920 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040632898696088267, + "loss": 2.9363, + "theoretical_loss": 3.8111362916728195, + "tokens_seen": 645139456 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040631895687061185, + "loss": 2.9618, + "theoretical_loss": 3.811096409591177, + "tokens_seen": 645204992 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040630892678034103, + "loss": 2.9966, + "theoretical_loss": 3.81105653269445, + "tokens_seen": 645270528 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004062988966900702, + "loss": 2.9729, + "theoretical_loss": 3.8110166609814398, + "tokens_seen": 645336064 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004062888665997994, + "loss": 3.0531, + "theoretical_loss": 3.810976794450945, + "tokens_seen": 645401600 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040627883650952857, + "loss": 2.9666, + "theoretical_loss": 3.8109369331017664, + "tokens_seen": 645467136 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1049677, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0577948093414307, + "objective/train/theoretical_loss": 3.8109070404893997, + "objective/train/tokens_used": 665976288, + "theoretical_loss": 3.8109070404893997, + "tokens_seen": 645516288 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004062688064192578, + "loss": 3.0647, + "theoretical_loss": 3.8108970769327053, + "tokens_seen": 645532672 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040625877632898693, + "loss": 2.8361, + "theoretical_loss": 3.8108572259425606, + "tokens_seen": 645598208 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040624874623871617, + "loss": 3.108, + "theoretical_loss": 3.8108173801301355, + "tokens_seen": 645663744 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004062387161484453, + "loss": 2.9224, + "theoretical_loss": 3.8107775394942314, + "tokens_seen": 645729280 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040622868605817453, + "loss": 3.0057, + "theoretical_loss": 3.8107377040336505, + "tokens_seen": 645794816 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004062186559679037, + "loss": 2.9894, + "theoretical_loss": 3.8106978737471953, + "tokens_seen": 645860352 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004062086258776329, + "loss": 3.0829, + "theoretical_loss": 3.8106580486336687, + "tokens_seen": 645925888 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004061985957873621, + "loss": 2.9901, + "theoretical_loss": 3.8106182286918746, + "tokens_seen": 645991424 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040618856569709126, + "loss": 2.9563, + "theoretical_loss": 3.8105784139206165, + "tokens_seen": 646056960 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040617853560682044, + "loss": 2.9716, + "theoretical_loss": 3.810538604318699, + "tokens_seen": 646122496 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040616850551654967, + "loss": 3.0088, + "theoretical_loss": 3.8104987998849262, + "tokens_seen": 646188032 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004061584754262788, + "loss": 2.9798, + "theoretical_loss": 3.810459000618103, + "tokens_seen": 646253568 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040614844533600804, + "loss": 2.987, + "theoretical_loss": 3.8104192065170355, + "tokens_seen": 646319104 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040613841524573727, + "loss": 2.9985, + "theoretical_loss": 3.810379417580529, + "tokens_seen": 646384640 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004061283851554664, + "loss": 3.0295, + "theoretical_loss": 3.81033963380739, + "tokens_seen": 646450176 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040611835506519563, + "loss": 3.0485, + "theoretical_loss": 3.810299855196425, + "tokens_seen": 646515712 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040610832497492476, + "loss": 2.9221, + "theoretical_loss": 3.810260081746441, + "tokens_seen": 646581248 + }, + { + "epoch": 1.09, + "learning_rate": 0.000406098294884654, + "loss": 3.0475, + "theoretical_loss": 3.810220313456245, + "tokens_seen": 646646784 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004060882647943832, + "loss": 2.8982, + "theoretical_loss": 3.8101805503246453, + "tokens_seen": 646712320 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040607823470411236, + "loss": 3.1093, + "theoretical_loss": 3.81014079235045, + "tokens_seen": 646777856 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040606820461384154, + "loss": 2.9731, + "theoretical_loss": 3.8101010395324675, + "tokens_seen": 646843392 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004060581745235707, + "loss": 2.9952, + "theoretical_loss": 3.8100612918695074, + "tokens_seen": 646908928 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004060481444332999, + "loss": 2.9028, + "theoretical_loss": 3.810021549360378, + "tokens_seen": 646974464 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040603811434302914, + "loss": 2.9859, + "theoretical_loss": 3.8099818120038895, + "tokens_seen": 647040000 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040602808425275826, + "loss": 3.058, + "theoretical_loss": 3.8099420797988524, + "tokens_seen": 647105536 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1050941, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0022668838500977, + "objective/train/theoretical_loss": 3.8099122840249997, + "objective/train/tokens_used": 667614688, + "theoretical_loss": 3.8099122840249997, + "tokens_seen": 647154688 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004060180541624875, + "loss": 3.0189, + "theoretical_loss": 3.8099023527440776, + "tokens_seen": 647171072 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004060080240722167, + "loss": 2.9058, + "theoretical_loss": 3.809862630838375, + "tokens_seen": 647236608 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040599799398194586, + "loss": 3.0904, + "theoretical_loss": 3.809822914080557, + "tokens_seen": 647302144 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040598796389167504, + "loss": 3.0614, + "theoretical_loss": 3.8097832024694345, + "tokens_seen": 647367680 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004059779338014042, + "loss": 2.904, + "theoretical_loss": 3.8097434960038203, + "tokens_seen": 647433216 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004059679037111334, + "loss": 3.0316, + "theoretical_loss": 3.8097037946825263, + "tokens_seen": 647498752 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040595787362086264, + "loss": 2.9735, + "theoretical_loss": 3.8096640985043657, + "tokens_seen": 647564288 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040594784353059177, + "loss": 3.0541, + "theoretical_loss": 3.809624407468152, + "tokens_seen": 647629824 + }, + { + "epoch": 1.09, + "learning_rate": 0.000405937813440321, + "loss": 2.9443, + "theoretical_loss": 3.8095847215726995, + "tokens_seen": 647695360 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040592778335005013, + "loss": 3.2039, + "theoretical_loss": 3.8095450408168205, + "tokens_seen": 647760896 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040591775325977936, + "loss": 2.9951, + "theoretical_loss": 3.8095053651993314, + "tokens_seen": 647826432 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040590772316950854, + "loss": 3.0554, + "theoretical_loss": 3.8094656947190457, + "tokens_seen": 647891968 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004058976930792377, + "loss": 3.059, + "theoretical_loss": 3.80942602937478, + "tokens_seen": 647957504 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004058876629889669, + "loss": 2.8827, + "theoretical_loss": 3.809386369165349, + "tokens_seen": 648023040 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004058776328986961, + "loss": 2.9656, + "theoretical_loss": 3.809346714089569, + "tokens_seen": 648088576 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040586760280842527, + "loss": 2.8847, + "theoretical_loss": 3.8093070641462567, + "tokens_seen": 648154112 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004058575727181545, + "loss": 3.0231, + "theoretical_loss": 3.809267419334229, + "tokens_seen": 648219648 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040584754262788363, + "loss": 3.0152, + "theoretical_loss": 3.8092277796523026, + "tokens_seen": 648285184 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040583751253761287, + "loss": 3.026, + "theoretical_loss": 3.8091881450992955, + "tokens_seen": 648350720 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040582748244734205, + "loss": 2.9821, + "theoretical_loss": 3.809148515674026, + "tokens_seen": 648416256 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040581745235707123, + "loss": 3.0289, + "theoretical_loss": 3.8091088913753124, + "tokens_seen": 648481792 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004058074222668004, + "loss": 3.0747, + "theoretical_loss": 3.809069272201973, + "tokens_seen": 648547328 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004057973921765296, + "loss": 2.8005, + "theoretical_loss": 3.809029658152828, + "tokens_seen": 648612864 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040578736208625877, + "loss": 3.063, + "theoretical_loss": 3.808990049226696, + "tokens_seen": 648678400 + }, + { + "epoch": 1.09, + "learning_rate": 0.000405777331995988, + "loss": 2.9478, + "theoretical_loss": 3.808950445422398, + "tokens_seen": 648743936 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1053673, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0664942264556885, + "objective/train/theoretical_loss": 3.8089207459296675, + "objective/train/tokens_used": 669253088, + "theoretical_loss": 3.8089207459296675, + "tokens_seen": 648793088 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040576730190571713, + "loss": 3.0174, + "theoretical_loss": 3.808910846738753, + "tokens_seen": 648809472 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040575727181544637, + "loss": 3.0262, + "theoretical_loss": 3.808871253174583, + "tokens_seen": 648875008 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004057472417251755, + "loss": 2.9287, + "theoretical_loss": 3.808831664728709, + "tokens_seen": 648940544 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040573721163490473, + "loss": 3.0383, + "theoretical_loss": 3.808792081399952, + "tokens_seen": 649006080 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004057271815446339, + "loss": 2.8112, + "theoretical_loss": 3.8087525031871348, + "tokens_seen": 649071616 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004057171514543631, + "loss": 3.0553, + "theoretical_loss": 3.8087129300890785, + "tokens_seen": 649137152 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004057071213640923, + "loss": 3.1193, + "theoretical_loss": 3.8086733621046074, + "tokens_seen": 649202688 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040569709127382146, + "loss": 3.0329, + "theoretical_loss": 3.8086337992325436, + "tokens_seen": 649268224 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040568706118355064, + "loss": 3.0729, + "theoretical_loss": 3.8085942414717104, + "tokens_seen": 649333760 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004056770310932799, + "loss": 2.9673, + "theoretical_loss": 3.8085546888209323, + "tokens_seen": 649399296 + }, + { + "epoch": 1.09, + "learning_rate": 0.000405667001003009, + "loss": 3.0417, + "theoretical_loss": 3.8085151412790332, + "tokens_seen": 649464832 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040565697091273824, + "loss": 3.0494, + "theoretical_loss": 3.808475598844838, + "tokens_seen": 649530368 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004056469408224674, + "loss": 2.9805, + "theoretical_loss": 3.808436061517172, + "tokens_seen": 649595904 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004056369107321966, + "loss": 2.8847, + "theoretical_loss": 3.8083965292948605, + "tokens_seen": 649661440 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004056268806419258, + "loss": 3.0218, + "theoretical_loss": 3.808357002176729, + "tokens_seen": 649726976 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040561685055165496, + "loss": 3.0313, + "theoretical_loss": 3.8083174801616044, + "tokens_seen": 649792512 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040560682046138414, + "loss": 3.0651, + "theoretical_loss": 3.8082779632483126, + "tokens_seen": 649858048 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004055967903711134, + "loss": 2.9259, + "theoretical_loss": 3.80823845143568, + "tokens_seen": 649923584 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004055867602808425, + "loss": 2.9715, + "theoretical_loss": 3.808198944722536, + "tokens_seen": 649989120 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040557673019057174, + "loss": 3.1077, + "theoretical_loss": 3.8081594431077073, + "tokens_seen": 650054656 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040556670010030087, + "loss": 2.8847, + "theoretical_loss": 3.808119946590022, + "tokens_seen": 650120192 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004055566700100301, + "loss": 2.8808, + "theoretical_loss": 3.808080455168308, + "tokens_seen": 650185728 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004055466399197593, + "loss": 3.0093, + "theoretical_loss": 3.8080409688413956, + "tokens_seen": 650251264 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040553660982948846, + "loss": 3.0028, + "theoretical_loss": 3.808001487608113, + "tokens_seen": 650316800 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040552657973921764, + "loss": 2.9153, + "theoretical_loss": 3.807962011467291, + "tokens_seen": 650382336 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1056476, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2774388790130615, + "objective/train/theoretical_loss": 3.8079324077028973, + "objective/train/tokens_used": 670891488, + "theoretical_loss": 3.8079324077028973, + "tokens_seen": 650431488 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004055165496489469, + "loss": 3.2043, + "theoretical_loss": 3.807922540417759, + "tokens_seen": 650447872 + }, + { + "epoch": 1.09, + "learning_rate": 0.000405506519558676, + "loss": 2.9786, + "theoretical_loss": 3.8078830744583474, + "tokens_seen": 650513408 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040549648946840524, + "loss": 3.0599, + "theoretical_loss": 3.8078436135878873, + "tokens_seen": 650578944 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040548645937813437, + "loss": 3.0419, + "theoretical_loss": 3.8078041578052098, + "tokens_seen": 650644480 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004054764292878636, + "loss": 3.0787, + "theoretical_loss": 3.807764707109147, + "tokens_seen": 650710016 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004054663991975928, + "loss": 3.1432, + "theoretical_loss": 3.807725261498531, + "tokens_seen": 650775552 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040545636910732197, + "loss": 3.0787, + "theoretical_loss": 3.8076858209721935, + "tokens_seen": 650841088 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040544633901705115, + "loss": 3.0804, + "theoretical_loss": 3.807646385528968, + "tokens_seen": 650906624 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040543630892678033, + "loss": 2.979, + "theoretical_loss": 3.807606955167687, + "tokens_seen": 650972160 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004054262788365095, + "loss": 2.8495, + "theoretical_loss": 3.8075675298871845, + "tokens_seen": 651037696 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040541624874623874, + "loss": 3.0855, + "theoretical_loss": 3.8075281096862947, + "tokens_seen": 651103232 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040540621865596787, + "loss": 3.0643, + "theoretical_loss": 3.8074886945638515, + "tokens_seen": 651168768 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004053961885656971, + "loss": 2.9542, + "theoretical_loss": 3.80744928451869, + "tokens_seen": 651234304 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004053861584754263, + "loss": 2.9946, + "theoretical_loss": 3.8074098795496454, + "tokens_seen": 651299840 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040537612838515547, + "loss": 2.9945, + "theoretical_loss": 3.8073704796555523, + "tokens_seen": 651365376 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004053660982948847, + "loss": 2.9478, + "theoretical_loss": 3.8073310848352477, + "tokens_seen": 651430912 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040535606820461383, + "loss": 2.9612, + "theoretical_loss": 3.8072916950875673, + "tokens_seen": 651496448 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040534603811434307, + "loss": 3.172, + "theoretical_loss": 3.807252310411348, + "tokens_seen": 651561984 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040533600802407225, + "loss": 3.0578, + "theoretical_loss": 3.8072129308054263, + "tokens_seen": 651627520 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040532597793380143, + "loss": 3.0132, + "theoretical_loss": 3.8071735562686406, + "tokens_seen": 651693056 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004053159478435306, + "loss": 2.9483, + "theoretical_loss": 3.807134186799828, + "tokens_seen": 651758592 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004053059177532598, + "loss": 2.9313, + "theoretical_loss": 3.8070948223978265, + "tokens_seen": 651824128 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040529588766298897, + "loss": 3.0226, + "theoretical_loss": 3.8070554630614755, + "tokens_seen": 651889664 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004052858575727182, + "loss": 2.8719, + "theoretical_loss": 3.8070161087896137, + "tokens_seen": 651955200 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040527582748244733, + "loss": 3.0372, + "theoretical_loss": 3.8069767595810795, + "tokens_seen": 652020736 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1059228, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6596765518188477, + "objective/train/theoretical_loss": 3.8069472509967905, + "objective/train/tokens_used": 672529888, + "theoretical_loss": 3.8069472509967905, + "tokens_seen": 652069888 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040526579739217657, + "loss": 2.8596, + "theoretical_loss": 3.8069374154347138, + "tokens_seen": 652086272 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004052557673019057, + "loss": 3.0032, + "theoretical_loss": 3.806898076349356, + "tokens_seen": 652151808 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040524573721163493, + "loss": 3.0353, + "theoretical_loss": 3.806858742323847, + "tokens_seen": 652217344 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004052357071213641, + "loss": 3.0557, + "theoretical_loss": 3.8068194133570277, + "tokens_seen": 652282880 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004052256770310933, + "loss": 3.0939, + "theoretical_loss": 3.806780089447739, + "tokens_seen": 652348416 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004052156469408225, + "loss": 3.0512, + "theoretical_loss": 3.806740770594822, + "tokens_seen": 652413952 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040520561685055166, + "loss": 3.1513, + "theoretical_loss": 3.8067014567971205, + "tokens_seen": 652479488 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040519558676028084, + "loss": 2.9769, + "theoretical_loss": 3.8066621480534755, + "tokens_seen": 652545024 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004051855566700101, + "loss": 3.042, + "theoretical_loss": 3.80662284436273, + "tokens_seen": 652610560 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004051755265797392, + "loss": 2.8542, + "theoretical_loss": 3.8065835457237274, + "tokens_seen": 652676096 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040516549648946844, + "loss": 2.8925, + "theoretical_loss": 3.8065442521353106, + "tokens_seen": 652741632 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004051554663991976, + "loss": 3.2298, + "theoretical_loss": 3.806504963596324, + "tokens_seen": 652807168 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004051454363089268, + "loss": 2.8951, + "theoretical_loss": 3.8064656801056125, + "tokens_seen": 652872704 + }, + { + "epoch": 1.09, + "learning_rate": 0.000405135406218656, + "loss": 3.0054, + "theoretical_loss": 3.80642640166202, + "tokens_seen": 652938240 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040512537612838516, + "loss": 2.9205, + "theoretical_loss": 3.8063871282643915, + "tokens_seen": 653003776 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040511534603811434, + "loss": 2.8846, + "theoretical_loss": 3.806347859911573, + "tokens_seen": 653069312 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004051053159478436, + "loss": 3.0513, + "theoretical_loss": 3.8063085966024097, + "tokens_seen": 653134848 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004050952858575727, + "loss": 2.9379, + "theoretical_loss": 3.8062693383357487, + "tokens_seen": 653200384 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040508525576730194, + "loss": 3.2126, + "theoretical_loss": 3.8062300851104354, + "tokens_seen": 653265920 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040507522567703107, + "loss": 3.0067, + "theoretical_loss": 3.806190836925318, + "tokens_seen": 653331456 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004050651955867603, + "loss": 3.0491, + "theoretical_loss": 3.8061515937792425, + "tokens_seen": 653396992 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004050551654964895, + "loss": 2.9017, + "theoretical_loss": 3.806112355671058, + "tokens_seen": 653462528 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040504513540621866, + "loss": 3.0469, + "theoretical_loss": 3.806073122599612, + "tokens_seen": 653528064 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040503510531594784, + "loss": 2.9112, + "theoretical_loss": 3.806033894563752, + "tokens_seen": 653593600 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004050250752256771, + "loss": 3.0513, + "theoretical_loss": 3.8059946715623285, + "tokens_seen": 653659136 + }, + { + "epoch": 1.09, + "objective/train/docs_used": 1061949, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1434218883514404, + "objective/train/theoretical_loss": 3.8059652576144165, + "objective/train/tokens_used": 674168288, + "theoretical_loss": 3.8059652576144165, + "tokens_seen": 653708288 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004050150451354062, + "loss": 3.0379, + "theoretical_loss": 3.8059554535941897, + "tokens_seen": 653724672 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040500501504513544, + "loss": 3.0046, + "theoretical_loss": 3.8059162406581857, + "tokens_seen": 653790208 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040499498495486457, + "loss": 3.017, + "theoretical_loss": 3.805877032753166, + "tokens_seen": 653855744 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004049849548645938, + "loss": 3.0414, + "theoretical_loss": 3.805837829877982, + "tokens_seen": 653921280 + }, + { + "epoch": 1.09, + "learning_rate": 0.000404974924774323, + "loss": 2.8866, + "theoretical_loss": 3.8057986320314834, + "tokens_seen": 653986816 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040496489468405217, + "loss": 2.8942, + "theoretical_loss": 3.8057594392125207, + "tokens_seen": 654052352 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040495486459378135, + "loss": 3.0617, + "theoretical_loss": 3.8057202514199475, + "tokens_seen": 654117888 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040494483450351053, + "loss": 3.1641, + "theoretical_loss": 3.805681068652614, + "tokens_seen": 654183424 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004049348044132397, + "loss": 2.9199, + "theoretical_loss": 3.8056418909093734, + "tokens_seen": 654248960 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040492477432296894, + "loss": 2.901, + "theoretical_loss": 3.805602718189078, + "tokens_seen": 654314496 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040491474423269807, + "loss": 2.997, + "theoretical_loss": 3.8055635504905805, + "tokens_seen": 654380032 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004049047141424273, + "loss": 2.9047, + "theoretical_loss": 3.805524387812734, + "tokens_seen": 654445568 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040489468405215643, + "loss": 2.8617, + "theoretical_loss": 3.8054852301543938, + "tokens_seen": 654511104 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040488465396188567, + "loss": 2.9885, + "theoretical_loss": 3.8054460775144126, + "tokens_seen": 654576640 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040487462387161485, + "loss": 3.0452, + "theoretical_loss": 3.805406929891645, + "tokens_seen": 654642176 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040486459378134403, + "loss": 3.1981, + "theoretical_loss": 3.805367787284947, + "tokens_seen": 654707712 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004048545636910732, + "loss": 3.0076, + "theoretical_loss": 3.8053286496931724, + "tokens_seen": 654773248 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040484453360080245, + "loss": 2.9719, + "theoretical_loss": 3.8052895171151784, + "tokens_seen": 654838784 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004048345035105316, + "loss": 2.9334, + "theoretical_loss": 3.8052503895498195, + "tokens_seen": 654904320 + }, + { + "epoch": 1.09, + "learning_rate": 0.0004048244734202608, + "loss": 3.1024, + "theoretical_loss": 3.805211266995953, + "tokens_seen": 654969856 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040481444332998994, + "loss": 3.0846, + "theoretical_loss": 3.805172149452435, + "tokens_seen": 655035392 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040480441323971917, + "loss": 3.0575, + "theoretical_loss": 3.8051330369181238, + "tokens_seen": 655100928 + }, + { + "epoch": 1.09, + "learning_rate": 0.00040479438314944835, + "loss": 3.0686, + "theoretical_loss": 3.805093929391876, + "tokens_seen": 655166464 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040478435305917753, + "loss": 2.9758, + "theoretical_loss": 3.80505482687255, + "tokens_seen": 655232000 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004047743229689067, + "loss": 2.9987, + "theoretical_loss": 3.8050157293590035, + "tokens_seen": 655297536 + }, + { + "debugging/Self-BLEU-5": 0.6129458043763282, + "debugging/distinct-1-grams": 0.7376882914406964, + "debugging/distinct-2-grams": 0.9413887351452321, + "debugging/entropy-1-grams": 6.247938365056754, + "debugging/entropy-2-grams": 7.456090314084394, + "debugging/length": 520.6538461538462, + "debugging/num_segments": 26, + "epoch": 1.1, + "objective/train/docs_used": 1064827, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.924983263015747, + "objective/train/theoretical_loss": 3.8049864095082, + "objective/train/tokens_used": 675806688, + "theoretical_loss": 3.8049864095082, + "tokens_seen": 655346688 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004047642928786359, + "loss": 3.0139, + "theoretical_loss": 3.8049766368500952, + "tokens_seen": 655363072 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004047542627883651, + "loss": 2.9704, + "theoretical_loss": 3.8049375493446846, + "tokens_seen": 655428608 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004047442326980943, + "loss": 2.8868, + "theoretical_loss": 3.8048984668416312, + "tokens_seen": 655494144 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040473420260782344, + "loss": 3.0832, + "theoretical_loss": 3.804859389339794, + "tokens_seen": 655559680 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004047241725175527, + "loss": 3.0266, + "theoretical_loss": 3.8048203168380335, + "tokens_seen": 655625216 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004047141424272818, + "loss": 3.0078, + "theoretical_loss": 3.8047812493352104, + "tokens_seen": 655690752 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040470411233701104, + "loss": 2.9038, + "theoretical_loss": 3.804742186830186, + "tokens_seen": 655756288 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004046940822467402, + "loss": 3.0214, + "theoretical_loss": 3.8047031293218208, + "tokens_seen": 655821824 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004046840521564694, + "loss": 2.995, + "theoretical_loss": 3.804664076808976, + "tokens_seen": 655887360 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004046740220661986, + "loss": 2.9549, + "theoretical_loss": 3.804625029290515, + "tokens_seen": 655952896 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004046639919759278, + "loss": 2.9036, + "theoretical_loss": 3.804585986765299, + "tokens_seen": 656018432 + }, + { + "epoch": 1.1, + "learning_rate": 0.000404653961885657, + "loss": 2.9869, + "theoretical_loss": 3.8045469492321917, + "tokens_seen": 656083968 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004046439317953862, + "loss": 2.9399, + "theoretical_loss": 3.8045079166900555, + "tokens_seen": 656149504 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040463390170511536, + "loss": 2.9821, + "theoretical_loss": 3.8044688891377545, + "tokens_seen": 656215040 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040462387161484454, + "loss": 3.1401, + "theoretical_loss": 3.804429866574152, + "tokens_seen": 656280576 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004046138415245738, + "loss": 2.9445, + "theoretical_loss": 3.8043908489981124, + "tokens_seen": 656346112 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004046038114343029, + "loss": 2.9665, + "theoretical_loss": 3.8043518364085003, + "tokens_seen": 656411648 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040459378134403214, + "loss": 3.0989, + "theoretical_loss": 3.804312828804181, + "tokens_seen": 656477184 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040458375125376127, + "loss": 2.9916, + "theoretical_loss": 3.8042738261840197, + "tokens_seen": 656542720 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004045737211634905, + "loss": 3.0081, + "theoretical_loss": 3.804234828546882, + "tokens_seen": 656608256 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004045636910732197, + "loss": 3.0357, + "theoretical_loss": 3.804195835891634, + "tokens_seen": 656673792 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040455366098294886, + "loss": 3.1128, + "theoretical_loss": 3.804156848217142, + "tokens_seen": 656739328 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040454363089267804, + "loss": 2.9576, + "theoretical_loss": 3.804117865522273, + "tokens_seen": 656804864 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004045336008024073, + "loss": 3.0678, + "theoretical_loss": 3.804078887805895, + "tokens_seen": 656870400 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004045235707121364, + "loss": 2.9487, + "theoretical_loss": 3.8040399150668747, + "tokens_seen": 656935936 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1066283, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.047975540161133, + "objective/train/theoretical_loss": 3.8040106887783196, + "objective/train/tokens_used": 677445088, + "theoretical_loss": 3.8040106887783196, + "tokens_seen": 656985088 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040451354062186564, + "loss": 2.9402, + "theoretical_loss": 3.80400094730408, + "tokens_seen": 657001472 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040450351053159477, + "loss": 3.0742, + "theoretical_loss": 3.8039619845163797, + "tokens_seen": 657067008 + }, + { + "epoch": 1.1, + "learning_rate": 0.000404493480441324, + "loss": 3.0508, + "theoretical_loss": 3.803923026702642, + "tokens_seen": 657132544 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004044834503510532, + "loss": 3.1139, + "theoretical_loss": 3.8038840738617368, + "tokens_seen": 657198080 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040447342026078237, + "loss": 3.0225, + "theoretical_loss": 3.8038451259925323, + "tokens_seen": 657263616 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040446339017051155, + "loss": 2.9878, + "theoretical_loss": 3.8038061830938994, + "tokens_seen": 657329152 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040445336008024073, + "loss": 3.0134, + "theoretical_loss": 3.8037672451647078, + "tokens_seen": 657394688 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004044433299899699, + "loss": 3.0155, + "theoretical_loss": 3.803728312203828, + "tokens_seen": 657460224 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040443329989969915, + "loss": 2.8763, + "theoretical_loss": 3.8036893842101316, + "tokens_seen": 657525760 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040442326980942827, + "loss": 2.9322, + "theoretical_loss": 3.803650461182489, + "tokens_seen": 657591296 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004044132397191575, + "loss": 3.0315, + "theoretical_loss": 3.8036115431197715, + "tokens_seen": 657656832 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040440320962888663, + "loss": 2.9484, + "theoretical_loss": 3.8035726300208523, + "tokens_seen": 657722368 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040439317953861587, + "loss": 3.0384, + "theoretical_loss": 3.8035337218846035, + "tokens_seen": 657787904 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040438314944834505, + "loss": 2.9101, + "theoretical_loss": 3.803494818709897, + "tokens_seen": 657853440 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040437311935807423, + "loss": 2.849, + "theoretical_loss": 3.8034559204956073, + "tokens_seen": 657918976 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004043630892678034, + "loss": 3.0163, + "theoretical_loss": 3.8034170272406067, + "tokens_seen": 657984512 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040435305917753265, + "loss": 2.9684, + "theoretical_loss": 3.8033781389437697, + "tokens_seen": 658050048 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004043430290872618, + "loss": 3.0593, + "theoretical_loss": 3.8033392556039702, + "tokens_seen": 658115584 + }, + { + "epoch": 1.1, + "learning_rate": 0.000404332998996991, + "loss": 3.0969, + "theoretical_loss": 3.8033003772200833, + "tokens_seen": 658181120 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040432296890672014, + "loss": 3.0605, + "theoretical_loss": 3.803261503790983, + "tokens_seen": 658246656 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040431293881644937, + "loss": 3.075, + "theoretical_loss": 3.8032226353155463, + "tokens_seen": 658312192 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040430290872617855, + "loss": 3.1862, + "theoretical_loss": 3.8031837717926473, + "tokens_seen": 658377728 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040429287863590773, + "loss": 3.1704, + "theoretical_loss": 3.8031449132211623, + "tokens_seen": 658443264 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004042828485456369, + "loss": 3.0729, + "theoretical_loss": 3.803106059599968, + "tokens_seen": 658508800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004042728184553661, + "loss": 3.1134, + "theoretical_loss": 3.803067210927942, + "tokens_seen": 658574336 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1069115, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1162467002868652, + "objective/train/theoretical_loss": 3.803038077671138, + "objective/train/tokens_used": 679083488, + "theoretical_loss": 3.803038077671138, + "tokens_seen": 658623488 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004042627883650953, + "loss": 3.2665, + "theoretical_loss": 3.803028367203961, + "tokens_seen": 658639872 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004042527582748245, + "loss": 2.936, + "theoretical_loss": 3.802989528426901, + "tokens_seen": 658705408 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040424272818455364, + "loss": 3.0245, + "theoretical_loss": 3.802950694595642, + "tokens_seen": 658770944 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004042326980942829, + "loss": 2.9854, + "theoretical_loss": 3.8029118657090613, + "tokens_seen": 658836480 + }, + { + "epoch": 1.1, + "learning_rate": 0.000404222668004012, + "loss": 3.0006, + "theoretical_loss": 3.8028730417660377, + "tokens_seen": 658902016 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040421263791374124, + "loss": 3.1174, + "theoretical_loss": 3.80283422276545, + "tokens_seen": 658967552 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004042026078234704, + "loss": 3.0268, + "theoretical_loss": 3.802795408706178, + "tokens_seen": 659033088 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004041925777331996, + "loss": 2.9379, + "theoretical_loss": 3.8027565995871004, + "tokens_seen": 659098624 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004041825476429288, + "loss": 3.1984, + "theoretical_loss": 3.802717795407099, + "tokens_seen": 659164160 + }, + { + "epoch": 1.1, + "learning_rate": 0.000404172517552658, + "loss": 3.0488, + "theoretical_loss": 3.802678996165053, + "tokens_seen": 659229696 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040416248746238714, + "loss": 3.1291, + "theoretical_loss": 3.8026402018598437, + "tokens_seen": 659295232 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004041524573721164, + "loss": 3.0579, + "theoretical_loss": 3.802601412490352, + "tokens_seen": 659360768 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004041424272818455, + "loss": 2.9796, + "theoretical_loss": 3.8025626280554596, + "tokens_seen": 659426304 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040413239719157474, + "loss": 2.9556, + "theoretical_loss": 3.8025238485540482, + "tokens_seen": 659491840 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004041223671013039, + "loss": 3.1231, + "theoretical_loss": 3.8024850739850007, + "tokens_seen": 659557376 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004041123370110331, + "loss": 3.0304, + "theoretical_loss": 3.802446304347199, + "tokens_seen": 659622912 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004041023069207623, + "loss": 3.1841, + "theoretical_loss": 3.802407539639527, + "tokens_seen": 659688448 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040409227683049147, + "loss": 3.0369, + "theoretical_loss": 3.802368779860867, + "tokens_seen": 659753984 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040408224674022065, + "loss": 2.9911, + "theoretical_loss": 3.8023300250101038, + "tokens_seen": 659819520 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004040722166499499, + "loss": 2.9691, + "theoretical_loss": 3.8022912750861204, + "tokens_seen": 659885056 + }, + { + "epoch": 1.1, + "learning_rate": 0.000404062186559679, + "loss": 2.9263, + "theoretical_loss": 3.8022525300878023, + "tokens_seen": 659950592 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040405215646940824, + "loss": 3.0242, + "theoretical_loss": 3.802213790014034, + "tokens_seen": 660016128 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040404212637913737, + "loss": 3.0459, + "theoretical_loss": 3.8021750548637003, + "tokens_seen": 660081664 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004040320962888666, + "loss": 2.9861, + "theoretical_loss": 3.8021363246356867, + "tokens_seen": 660147200 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004040220661985958, + "loss": 2.8293, + "theoretical_loss": 3.8020975993288797, + "tokens_seen": 660212736 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1072034, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0860753059387207, + "objective/train/theoretical_loss": 3.8020685585776466, + "objective/train/tokens_used": 680721888, + "theoretical_loss": 3.8020685585776466, + "tokens_seen": 660261888 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040401203610832497, + "loss": 3.043, + "theoretical_loss": 3.802058878942166, + "tokens_seen": 660278272 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040400200601805415, + "loss": 2.9145, + "theoretical_loss": 3.8020201634744306, + "tokens_seen": 660343808 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004039919759277834, + "loss": 3.0642, + "theoretical_loss": 3.8019814529245624, + "tokens_seen": 660409344 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004039819458375125, + "loss": 3.0309, + "theoretical_loss": 3.801942747291447, + "tokens_seen": 660474880 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040397191574724175, + "loss": 2.9826, + "theoretical_loss": 3.8019040465739735, + "tokens_seen": 660540416 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004039618856569709, + "loss": 3.017, + "theoretical_loss": 3.8018653507710294, + "tokens_seen": 660605952 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004039518555667001, + "loss": 2.9096, + "theoretical_loss": 3.8018266598815034, + "tokens_seen": 660671488 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004039418254764293, + "loss": 3.0518, + "theoretical_loss": 3.8017879739042835, + "tokens_seen": 660737024 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040393179538615847, + "loss": 3.0303, + "theoretical_loss": 3.8017492928382604, + "tokens_seen": 660802560 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040392176529588765, + "loss": 3.0369, + "theoretical_loss": 3.8017106166823225, + "tokens_seen": 660868096 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040391173520561683, + "loss": 2.8301, + "theoretical_loss": 3.80167194543536, + "tokens_seen": 660933632 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040390170511534607, + "loss": 3.0697, + "theoretical_loss": 3.8016332790962633, + "tokens_seen": 660999168 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040389167502507525, + "loss": 2.8804, + "theoretical_loss": 3.8015946176639224, + "tokens_seen": 661064704 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040388164493480443, + "loss": 2.9209, + "theoretical_loss": 3.801555961137229, + "tokens_seen": 661130240 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004038716148445336, + "loss": 2.8174, + "theoretical_loss": 3.8015173095150745, + "tokens_seen": 661195776 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040386158475426285, + "loss": 2.9715, + "theoretical_loss": 3.8014786627963497, + "tokens_seen": 661261312 + }, + { + "epoch": 1.1, + "learning_rate": 0.000403851554663992, + "loss": 3.0105, + "theoretical_loss": 3.801440020979948, + "tokens_seen": 661326848 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004038415245737212, + "loss": 3.0439, + "theoretical_loss": 3.8014013840647602, + "tokens_seen": 661392384 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040383149448345034, + "loss": 3.0536, + "theoretical_loss": 3.8013627520496804, + "tokens_seen": 661457920 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040382146439317957, + "loss": 2.9549, + "theoretical_loss": 3.8013241249336014, + "tokens_seen": 661523456 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040381143430290875, + "loss": 3.0888, + "theoretical_loss": 3.801285502715417, + "tokens_seen": 661588992 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040380140421263794, + "loss": 3.0505, + "theoretical_loss": 3.8012468853940202, + "tokens_seen": 661654528 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004037913741223671, + "loss": 2.9968, + "theoretical_loss": 3.8012082729683057, + "tokens_seen": 661720064 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004037813440320963, + "loss": 3.1635, + "theoretical_loss": 3.8011696654371683, + "tokens_seen": 661785600 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004037713139418255, + "loss": 2.8997, + "theoretical_loss": 3.8011310627995027, + "tokens_seen": 661851136 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1074885, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3360025882720947, + "objective/train/theoretical_loss": 3.80110211403193, + "objective/train/tokens_used": 682360288, + "theoretical_loss": 3.80110211403193, + "tokens_seen": 661900288 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004037612838515547, + "loss": 3.0814, + "theoretical_loss": 3.8010924650542046, + "tokens_seen": 661916672 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040375125376128384, + "loss": 3.0444, + "theoretical_loss": 3.801053872200169, + "tokens_seen": 661982208 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004037412236710131, + "loss": 3.0795, + "theoretical_loss": 3.801015284236292, + "tokens_seen": 662047744 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004037311935807422, + "loss": 2.8017, + "theoretical_loss": 3.8009767011614706, + "tokens_seen": 662113280 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040372116349047144, + "loss": 2.9961, + "theoretical_loss": 3.8009381229746007, + "tokens_seen": 662178816 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004037111334002006, + "loss": 3.2018, + "theoretical_loss": 3.8008995496745803, + "tokens_seen": 662244352 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004037011033099298, + "loss": 2.93, + "theoretical_loss": 3.800860981260306, + "tokens_seen": 662309888 + }, + { + "epoch": 1.1, + "learning_rate": 0.000403691073219659, + "loss": 3.2136, + "theoretical_loss": 3.800822417730676, + "tokens_seen": 662375424 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004036810431293882, + "loss": 2.9591, + "theoretical_loss": 3.800783859084589, + "tokens_seen": 662440960 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040367101303911734, + "loss": 3.0411, + "theoretical_loss": 3.800745305320943, + "tokens_seen": 662506496 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004036609829488466, + "loss": 3.0493, + "theoretical_loss": 3.800706756438636, + "tokens_seen": 662572032 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004036509528585757, + "loss": 3.1743, + "theoretical_loss": 3.8006682124365687, + "tokens_seen": 662637568 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040364092276830494, + "loss": 2.8823, + "theoretical_loss": 3.8006296733136398, + "tokens_seen": 662703104 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004036308926780341, + "loss": 2.9143, + "theoretical_loss": 3.8005911390687497, + "tokens_seen": 662768640 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004036208625877633, + "loss": 3.0754, + "theoretical_loss": 3.8005526097007984, + "tokens_seen": 662834176 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004036108324974925, + "loss": 3.0101, + "theoretical_loss": 3.800514085208687, + "tokens_seen": 662899712 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040360080240722167, + "loss": 2.794, + "theoretical_loss": 3.800475565591316, + "tokens_seen": 662965248 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040359077231695085, + "loss": 2.9964, + "theoretical_loss": 3.800437050847587, + "tokens_seen": 663030784 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004035807422266801, + "loss": 2.9894, + "theoretical_loss": 3.800398540976402, + "tokens_seen": 663096320 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004035707121364092, + "loss": 3.0188, + "theoretical_loss": 3.800360035976663, + "tokens_seen": 663161856 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040356068204613844, + "loss": 3.0797, + "theoretical_loss": 3.8003215358472717, + "tokens_seen": 663227392 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040355065195586757, + "loss": 3.0494, + "theoretical_loss": 3.800283040587132, + "tokens_seen": 663292928 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004035406218655968, + "loss": 2.9143, + "theoretical_loss": 3.8002445501951465, + "tokens_seen": 663358464 + }, + { + "epoch": 1.1, + "learning_rate": 0.000403530591775326, + "loss": 3.0102, + "theoretical_loss": 3.8002060646702187, + "tokens_seen": 663424000 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040352056168505517, + "loss": 2.8428, + "theoretical_loss": 3.8001675840112528, + "tokens_seen": 663489536 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1223957538604736, + "objective/train/theoretical_loss": 3.800138726709656, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.800138726709656, + "tokens_seen": 663538688 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040351053159478435, + "loss": 3.0304, + "theoretical_loss": 3.8001291082171518, + "tokens_seen": 663555072 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004035005015045136, + "loss": 2.9045, + "theoretical_loss": 3.8000906372868224, + "tokens_seen": 663620608 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004034904714142427, + "loss": 3.1653, + "theoretical_loss": 3.8000521712191677, + "tokens_seen": 663686144 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040348044132397195, + "loss": 2.7602, + "theoretical_loss": 3.800013710013094, + "tokens_seen": 663751680 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004034704112337011, + "loss": 3.1523, + "theoretical_loss": 3.7999752536675064, + "tokens_seen": 663817216 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004034603811434303, + "loss": 3.0702, + "theoretical_loss": 3.7999368021813114, + "tokens_seen": 663882752 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004034503510531595, + "loss": 3.1265, + "theoretical_loss": 3.7998983555534145, + "tokens_seen": 663948288 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040344032096288867, + "loss": 3.0968, + "theoretical_loss": 3.7998599137827234, + "tokens_seen": 664013824 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040343029087261785, + "loss": 3.0127, + "theoretical_loss": 3.7998214768681446, + "tokens_seen": 664079360 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040342026078234703, + "loss": 2.9713, + "theoretical_loss": 3.799783044808586, + "tokens_seen": 664144896 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004034102306920762, + "loss": 3.0398, + "theoretical_loss": 3.799744617602954, + "tokens_seen": 664210432 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040340020060180545, + "loss": 2.813, + "theoretical_loss": 3.7997061952501587, + "tokens_seen": 664275968 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004033901705115346, + "loss": 3.0101, + "theoretical_loss": 3.7996677777491072, + "tokens_seen": 664341504 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004033801404212638, + "loss": 3.0215, + "theoretical_loss": 3.799629365098709, + "tokens_seen": 664407040 + }, + { + "epoch": 1.1, + "learning_rate": 0.000403370110330993, + "loss": 3.0085, + "theoretical_loss": 3.7995909572978723, + "tokens_seen": 664472576 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004033600802407222, + "loss": 2.9708, + "theoretical_loss": 3.799552554345508, + "tokens_seen": 664538112 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040335005015045136, + "loss": 3.0787, + "theoretical_loss": 3.799514156240525, + "tokens_seen": 664603648 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040334002006018054, + "loss": 2.8593, + "theoretical_loss": 3.799475762981834, + "tokens_seen": 664669184 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004033299899699097, + "loss": 2.9549, + "theoretical_loss": 3.7994373745683454, + "tokens_seen": 664734720 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040331995987963895, + "loss": 2.9316, + "theoretical_loss": 3.79939899099897, + "tokens_seen": 664800256 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004033099297893681, + "loss": 2.7627, + "theoretical_loss": 3.7993606122726193, + "tokens_seen": 664865792 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004032998996990973, + "loss": 2.8442, + "theoretical_loss": 3.7993222383882053, + "tokens_seen": 664931328 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040328986960882644, + "loss": 2.9892, + "theoretical_loss": 3.7992838693446394, + "tokens_seen": 664996864 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004032798395185557, + "loss": 2.9231, + "theoretical_loss": 3.799245505140834, + "tokens_seen": 665062400 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040326980942828486, + "loss": 2.8546, + "theoretical_loss": 3.7992071457757017, + "tokens_seen": 665127936 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.97348952293396, + "objective/train/theoretical_loss": 3.7991783794265785, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.7991783794265785, + "tokens_seen": 665177088 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040325977933801404, + "loss": 2.9609, + "theoretical_loss": 3.7991687912481567, + "tokens_seen": 665193472 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004032497492477432, + "loss": 2.8757, + "theoretical_loss": 3.7991304415571108, + "tokens_seen": 665259008 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004032397191574724, + "loss": 3.1732, + "theoretical_loss": 3.7990920967014787, + "tokens_seen": 665324544 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004032296890672016, + "loss": 3.0592, + "theoretical_loss": 3.7990537566801743, + "tokens_seen": 665390080 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004032196589769308, + "loss": 2.7155, + "theoretical_loss": 3.799015421492112, + "tokens_seen": 665455616 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040320962888665995, + "loss": 2.9653, + "theoretical_loss": 3.798977091136207, + "tokens_seen": 665521152 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004031995987963892, + "loss": 2.9889, + "theoretical_loss": 3.798938765611374, + "tokens_seen": 665586688 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040318956870611836, + "loss": 2.8826, + "theoretical_loss": 3.7989004449165282, + "tokens_seen": 665652224 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040317953861584754, + "loss": 3.069, + "theoretical_loss": 3.798862129050586, + "tokens_seen": 665717760 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004031695085255767, + "loss": 2.9927, + "theoretical_loss": 3.7988238180124636, + "tokens_seen": 665783296 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004031594784353059, + "loss": 2.9472, + "theoretical_loss": 3.7987855118010776, + "tokens_seen": 665848832 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040314944834503514, + "loss": 2.9634, + "theoretical_loss": 3.798747210415345, + "tokens_seen": 665914368 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004031394182547643, + "loss": 2.9868, + "theoretical_loss": 3.798708913854182, + "tokens_seen": 665979904 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004031293881644935, + "loss": 2.908, + "theoretical_loss": 3.7986706221165076, + "tokens_seen": 666045440 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004031193580742227, + "loss": 2.9253, + "theoretical_loss": 3.7986323352012388, + "tokens_seen": 666110976 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040310932798395187, + "loss": 3.036, + "theoretical_loss": 3.798594053107295, + "tokens_seen": 666176512 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040309929789368105, + "loss": 3.0759, + "theoretical_loss": 3.7985557758335933, + "tokens_seen": 666242048 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004030892678034103, + "loss": 3.0092, + "theoretical_loss": 3.798517503379054, + "tokens_seen": 666307584 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004030792377131394, + "loss": 2.9958, + "theoretical_loss": 3.7984792357425956, + "tokens_seen": 666373120 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040306920762286864, + "loss": 3.0447, + "theoretical_loss": 3.7984409729231383, + "tokens_seen": 666438656 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040305917753259777, + "loss": 3.0931, + "theoretical_loss": 3.7984027149196025, + "tokens_seen": 666504192 + }, + { + "epoch": 1.1, + "learning_rate": 0.000403049147442327, + "loss": 3.0972, + "theoretical_loss": 3.7983644617309076, + "tokens_seen": 666569728 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004030391173520562, + "loss": 2.9054, + "theoretical_loss": 3.798326213355975, + "tokens_seen": 666635264 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040302908726178537, + "loss": 2.9494, + "theoretical_loss": 3.7982879697937255, + "tokens_seen": 666700800 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040301905717151455, + "loss": 2.8682, + "theoretical_loss": 3.7982497310430814, + "tokens_seen": 666766336 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0644009113311768, + "objective/train/theoretical_loss": 3.798221055137065, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.798221055137065, + "tokens_seen": 666815488 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004030090270812438, + "loss": 3.0366, + "theoretical_loss": 3.7982114971029635, + "tokens_seen": 666831872 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004029989969909729, + "loss": 3.016, + "theoretical_loss": 3.7981732679722944, + "tokens_seen": 666897408 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040298896690070215, + "loss": 3.0231, + "theoretical_loss": 3.798135043649996, + "tokens_seen": 666962944 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004029789368104313, + "loss": 3.018, + "theoretical_loss": 3.7980968241349915, + "tokens_seen": 667028480 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004029689067201605, + "loss": 3.0381, + "theoretical_loss": 3.7980586094262043, + "tokens_seen": 667094016 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004029588766298897, + "loss": 2.8677, + "theoretical_loss": 3.798020399522558, + "tokens_seen": 667159552 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040294884653961887, + "loss": 2.852, + "theoretical_loss": 3.7979821944229757, + "tokens_seen": 667225088 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040293881644934805, + "loss": 3.0215, + "theoretical_loss": 3.7979439941263826, + "tokens_seen": 667290624 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040292878635907723, + "loss": 2.9866, + "theoretical_loss": 3.7979057986317026, + "tokens_seen": 667356160 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004029187562688064, + "loss": 2.9357, + "theoretical_loss": 3.797867607937861, + "tokens_seen": 667421696 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040290872617853565, + "loss": 2.9285, + "theoretical_loss": 3.797829422043783, + "tokens_seen": 667487232 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004028986960882648, + "loss": 3.2226, + "theoretical_loss": 3.797791240948394, + "tokens_seen": 667552768 + }, + { + "epoch": 1.1, + "learning_rate": 0.000402888665997994, + "loss": 3.0334, + "theoretical_loss": 3.79775306465062, + "tokens_seen": 667618304 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004028786359077232, + "loss": 2.8843, + "theoretical_loss": 3.7977148931493874, + "tokens_seen": 667683840 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004028686058174524, + "loss": 3.0244, + "theoretical_loss": 3.797676726443622, + "tokens_seen": 667749376 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040285857572718156, + "loss": 3.0767, + "theoretical_loss": 3.797638564532252, + "tokens_seen": 667814912 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040284854563691074, + "loss": 3.0098, + "theoretical_loss": 3.7976004074142047, + "tokens_seen": 667880448 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004028385155466399, + "loss": 2.9582, + "theoretical_loss": 3.7975622550884074, + "tokens_seen": 667945984 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040282848545636915, + "loss": 3.0117, + "theoretical_loss": 3.797524107553788, + "tokens_seen": 668011520 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004028184553660983, + "loss": 2.7926, + "theoretical_loss": 3.7974859648092747, + "tokens_seen": 668077056 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004028084252758275, + "loss": 3.1399, + "theoretical_loss": 3.797447826853797, + "tokens_seen": 668142592 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040279839518555664, + "loss": 3.0523, + "theoretical_loss": 3.7974096936862827, + "tokens_seen": 668208128 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004027883650952859, + "loss": 3.1037, + "theoretical_loss": 3.7973715653056628, + "tokens_seen": 668273664 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040277833500501506, + "loss": 2.9967, + "theoretical_loss": 3.7973334417108653, + "tokens_seen": 668339200 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040276830491474424, + "loss": 2.9862, + "theoretical_loss": 3.7972953229008217, + "tokens_seen": 668404736 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9639484882354736, + "objective/train/theoretical_loss": 3.79726673693264, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.79726673693264, + "tokens_seen": 668453888 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004027582748244734, + "loss": 2.8456, + "theoretical_loss": 3.797257208874462, + "tokens_seen": 668470272 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004027482447342026, + "loss": 3.0254, + "theoretical_loss": 3.7972190996307162, + "tokens_seen": 668535808 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004027382146439318, + "loss": 2.9356, + "theoretical_loss": 3.7971809951685165, + "tokens_seen": 668601344 + }, + { + "epoch": 1.1, + "learning_rate": 0.000402728184553661, + "loss": 3.0204, + "theoretical_loss": 3.797142895486794, + "tokens_seen": 668666880 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040271815446339015, + "loss": 2.9401, + "theoretical_loss": 3.7971048005844805, + "tokens_seen": 668732416 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004027081243731194, + "loss": 3.0119, + "theoretical_loss": 3.7970667104605083, + "tokens_seen": 668797952 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040269809428284856, + "loss": 2.9116, + "theoretical_loss": 3.7970286251138097, + "tokens_seen": 668863488 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040268806419257774, + "loss": 3.013, + "theoretical_loss": 3.796990544543317, + "tokens_seen": 668929024 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004026780341023069, + "loss": 2.9108, + "theoretical_loss": 3.796952468747965, + "tokens_seen": 668994560 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004026680040120361, + "loss": 3.0807, + "theoretical_loss": 3.7969143977266855, + "tokens_seen": 669060096 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004026579739217653, + "loss": 3.0066, + "theoretical_loss": 3.796876331478413, + "tokens_seen": 669125632 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004026479438314945, + "loss": 2.9168, + "theoretical_loss": 3.7968382700020813, + "tokens_seen": 669191168 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040263791374122365, + "loss": 3.0256, + "theoretical_loss": 3.796800213296626, + "tokens_seen": 669256704 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004026278836509529, + "loss": 3.0311, + "theoretical_loss": 3.7967621613609817, + "tokens_seen": 669322240 + }, + { + "epoch": 1.1, + "learning_rate": 0.000402617853560682, + "loss": 3.0391, + "theoretical_loss": 3.796724114194083, + "tokens_seen": 669387776 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040260782347041125, + "loss": 3.0696, + "theoretical_loss": 3.7966860717948654, + "tokens_seen": 669453312 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040259779338014043, + "loss": 2.9337, + "theoretical_loss": 3.7966480341622653, + "tokens_seen": 669518848 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004025877632898696, + "loss": 3.0074, + "theoretical_loss": 3.7966100012952193, + "tokens_seen": 669584384 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004025777331995988, + "loss": 3.0686, + "theoretical_loss": 3.7965719731926635, + "tokens_seen": 669649920 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040256770310932797, + "loss": 2.9817, + "theoretical_loss": 3.796533949853535, + "tokens_seen": 669715456 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040255767301905715, + "loss": 2.8373, + "theoretical_loss": 3.7964959312767705, + "tokens_seen": 669780992 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004025476429287864, + "loss": 2.9291, + "theoretical_loss": 3.796457917461309, + "tokens_seen": 669846528 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004025376128385155, + "loss": 2.9473, + "theoretical_loss": 3.796419908406087, + "tokens_seen": 669912064 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040252758274824475, + "loss": 2.7608, + "theoretical_loss": 3.796381904110044, + "tokens_seen": 669977600 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040251755265797393, + "loss": 2.9314, + "theoretical_loss": 3.796343904572117, + "tokens_seen": 670043136 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.859225034713745, + "objective/train/theoretical_loss": 3.7963154080405483, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.7963154080405483, + "tokens_seen": 670092288 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004025075225677031, + "loss": 2.9617, + "theoretical_loss": 3.796305909791247, + "tokens_seen": 670108672 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004024974924774323, + "loss": 3.0713, + "theoretical_loss": 3.796267919766372, + "tokens_seen": 670174208 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004024874623871615, + "loss": 2.973, + "theoretical_loss": 3.796229934496432, + "tokens_seen": 670239744 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040247743229689066, + "loss": 2.9129, + "theoretical_loss": 3.796191953980367, + "tokens_seen": 670305280 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004024674022066199, + "loss": 2.9282, + "theoretical_loss": 3.796153978217118, + "tokens_seen": 670370816 + }, + { + "epoch": 1.1, + "learning_rate": 0.000402457372116349, + "loss": 2.9417, + "theoretical_loss": 3.796116007205624, + "tokens_seen": 670436352 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040244734202607825, + "loss": 2.9707, + "theoretical_loss": 3.7960780409448285, + "tokens_seen": 670501888 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004024373119358074, + "loss": 3.1246, + "theoretical_loss": 3.796040079433671, + "tokens_seen": 670567424 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004024272818455366, + "loss": 3.0935, + "theoretical_loss": 3.7960021226710934, + "tokens_seen": 670632960 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004024172517552658, + "loss": 3.0073, + "theoretical_loss": 3.7959641706560383, + "tokens_seen": 670698496 + }, + { + "epoch": 1.1, + "learning_rate": 0.000402407221664995, + "loss": 2.9193, + "theoretical_loss": 3.795926223387448, + "tokens_seen": 670764032 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004023971915747242, + "loss": 2.8935, + "theoretical_loss": 3.795888280864264, + "tokens_seen": 670829568 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004023871614844534, + "loss": 2.881, + "theoretical_loss": 3.7958503430854313, + "tokens_seen": 670895104 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004023771313941826, + "loss": 3.0358, + "theoretical_loss": 3.7958124100498924, + "tokens_seen": 670960640 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040236710130391176, + "loss": 2.9601, + "theoretical_loss": 3.795774481756591, + "tokens_seen": 671026176 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040235707121364094, + "loss": 3.1117, + "theoretical_loss": 3.795736558204471, + "tokens_seen": 671091712 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004023470411233701, + "loss": 2.9251, + "theoretical_loss": 3.7956986393924774, + "tokens_seen": 671157248 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040233701103309935, + "loss": 3.1058, + "theoretical_loss": 3.795660725319555, + "tokens_seen": 671222784 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004023269809428285, + "loss": 3.0772, + "theoretical_loss": 3.7956228159846477, + "tokens_seen": 671288320 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004023169508525577, + "loss": 2.9484, + "theoretical_loss": 3.795584911386702, + "tokens_seen": 671353856 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040230692076228684, + "loss": 2.9096, + "theoretical_loss": 3.795547011524664, + "tokens_seen": 671419392 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004022968906720161, + "loss": 2.818, + "theoretical_loss": 3.7955091163974783, + "tokens_seen": 671484928 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040228686058174526, + "loss": 2.9683, + "theoretical_loss": 3.795471226004093, + "tokens_seen": 671550464 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040227683049147444, + "loss": 3.0174, + "theoretical_loss": 3.795433340343454, + "tokens_seen": 671616000 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004022668004012036, + "loss": 2.8566, + "theoretical_loss": 3.7953954594145083, + "tokens_seen": 671681536 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.130408525466919, + "objective/train/theoretical_loss": 3.79536705182234, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.79536705182234, + "tokens_seen": 671730688 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004022567703109328, + "loss": 3.1319, + "theoretical_loss": 3.7953575832162034, + "tokens_seen": 671747072 + }, + { + "epoch": 1.1, + "learning_rate": 0.000402246740220662, + "loss": 3.03, + "theoretical_loss": 3.7953197117474877, + "tokens_seen": 671812608 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004022367101303912, + "loss": 2.982, + "theoretical_loss": 3.7952818450073087, + "tokens_seen": 671878144 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040222668004012035, + "loss": 2.9706, + "theoretical_loss": 3.795243982994615, + "tokens_seen": 671943680 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004022166499498496, + "loss": 2.8919, + "theoretical_loss": 3.795206125708356, + "tokens_seen": 672009216 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040220661985957876, + "loss": 3.1161, + "theoretical_loss": 3.79516827314748, + "tokens_seen": 672074752 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040219658976930794, + "loss": 2.9634, + "theoretical_loss": 3.795130425310937, + "tokens_seen": 672140288 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004021865596790371, + "loss": 2.9902, + "theoretical_loss": 3.7950925821976766, + "tokens_seen": 672205824 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004021765295887663, + "loss": 2.9786, + "theoretical_loss": 3.7950547438066486, + "tokens_seen": 672271360 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004021664994984955, + "loss": 3.0032, + "theoretical_loss": 3.795016910136804, + "tokens_seen": 672336896 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004021564694082247, + "loss": 3.2141, + "theoretical_loss": 3.794979081187094, + "tokens_seen": 672402432 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040214643931795385, + "loss": 3.1511, + "theoretical_loss": 3.7949412569564682, + "tokens_seen": 672467968 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004021364092276831, + "loss": 3.0448, + "theoretical_loss": 3.79490343744388, + "tokens_seen": 672533504 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004021263791374122, + "loss": 2.9104, + "theoretical_loss": 3.79486562264828, + "tokens_seen": 672599040 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040211634904714145, + "loss": 3.041, + "theoretical_loss": 3.7948278125686206, + "tokens_seen": 672664576 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040210631895687063, + "loss": 2.9221, + "theoretical_loss": 3.7947900072038547, + "tokens_seen": 672730112 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004020962888665998, + "loss": 3.1136, + "theoretical_loss": 3.7947522065529347, + "tokens_seen": 672795648 + }, + { + "epoch": 1.1, + "learning_rate": 0.000402086258776329, + "loss": 2.958, + "theoretical_loss": 3.794714410614813, + "tokens_seen": 672861184 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040207622868605817, + "loss": 2.9582, + "theoretical_loss": 3.7946766193884454, + "tokens_seen": 672926720 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040206619859578735, + "loss": 2.8008, + "theoretical_loss": 3.794638832872783, + "tokens_seen": 672992256 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004020561685055166, + "loss": 3.0991, + "theoretical_loss": 3.794601051066782, + "tokens_seen": 673057792 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004020461384152457, + "loss": 2.9839, + "theoretical_loss": 3.7945632739693957, + "tokens_seen": 673123328 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040203610832497495, + "loss": 2.925, + "theoretical_loss": 3.79452550157958, + "tokens_seen": 673188864 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040202607823470413, + "loss": 2.9432, + "theoretical_loss": 3.794487733896289, + "tokens_seen": 673254400 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004020160481444333, + "loss": 3.0493, + "theoretical_loss": 3.7944499709184782, + "tokens_seen": 673319936 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9047727584838867, + "objective/train/theoretical_loss": 3.7944216517724643, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.7944216517724643, + "tokens_seen": 673369088 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004020060180541625, + "loss": 2.8406, + "theoretical_loss": 3.7944122126451045, + "tokens_seen": 673385472 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004019959879638917, + "loss": 3.1231, + "theoretical_loss": 3.794374459075123, + "tokens_seen": 673451008 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040198595787362086, + "loss": 3.2678, + "theoretical_loss": 3.7943367102074914, + "tokens_seen": 673516544 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004019759277833501, + "loss": 2.9504, + "theoretical_loss": 3.794298966041165, + "tokens_seen": 673582080 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004019658976930792, + "loss": 3.1024, + "theoretical_loss": 3.7942612265751023, + "tokens_seen": 673647616 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040195586760280845, + "loss": 2.937, + "theoretical_loss": 3.7942234918082596, + "tokens_seen": 673713152 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004019458375125376, + "loss": 2.9349, + "theoretical_loss": 3.794185761739596, + "tokens_seen": 673778688 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004019358074222668, + "loss": 2.8691, + "theoretical_loss": 3.7941480363680684, + "tokens_seen": 673844224 + }, + { + "epoch": 1.1, + "learning_rate": 0.000401925777331996, + "loss": 3.0564, + "theoretical_loss": 3.7941103156926363, + "tokens_seen": 673909760 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004019157472417252, + "loss": 2.8957, + "theoretical_loss": 3.7940725997122584, + "tokens_seen": 673975296 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040190571715145436, + "loss": 2.8569, + "theoretical_loss": 3.794034888425893, + "tokens_seen": 674040832 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004018956870611836, + "loss": 2.9594, + "theoretical_loss": 3.7939971818325002, + "tokens_seen": 674106368 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004018856569709127, + "loss": 3.0341, + "theoretical_loss": 3.7939594799310408, + "tokens_seen": 674171904 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040187562688064196, + "loss": 3.0612, + "theoretical_loss": 3.793921782720473, + "tokens_seen": 674237440 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004018655967903711, + "loss": 3.0312, + "theoretical_loss": 3.793884090199758, + "tokens_seen": 674302976 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004018555667001003, + "loss": 2.8775, + "theoretical_loss": 3.7938464023678575, + "tokens_seen": 674368512 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004018455366098295, + "loss": 3.0729, + "theoretical_loss": 3.7938087192237324, + "tokens_seen": 674434048 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004018355065195587, + "loss": 2.8645, + "theoretical_loss": 3.793771040766343, + "tokens_seen": 674499584 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040182547642928786, + "loss": 2.8023, + "theoretical_loss": 3.7937333669946525, + "tokens_seen": 674565120 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040181544633901704, + "loss": 3.0418, + "theoretical_loss": 3.793695697907622, + "tokens_seen": 674630656 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004018054162487462, + "loss": 3.1099, + "theoretical_loss": 3.7936580335042147, + "tokens_seen": 674696192 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040179538615847546, + "loss": 2.9786, + "theoretical_loss": 3.7936203737833933, + "tokens_seen": 674761728 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004017853560682046, + "loss": 2.8935, + "theoretical_loss": 3.7935827187441205, + "tokens_seen": 674827264 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004017753259779338, + "loss": 2.9116, + "theoretical_loss": 3.79354506838536, + "tokens_seen": 674892800 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040176529588766295, + "loss": 2.9872, + "theoretical_loss": 3.7935074227060754, + "tokens_seen": 674958336 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.798358917236328, + "objective/train/theoretical_loss": 3.7934791915168953, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.7934791915168953, + "tokens_seen": 675007488 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004017552657973922, + "loss": 3.0348, + "theoretical_loss": 3.793469781705231, + "tokens_seen": 675023872 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040174523570712137, + "loss": 2.9462, + "theoretical_loss": 3.793432145381792, + "tokens_seen": 675089408 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040173520561685055, + "loss": 2.9591, + "theoretical_loss": 3.793394513734722, + "tokens_seen": 675154944 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040172517552657973, + "loss": 2.9918, + "theoretical_loss": 3.7933568867629868, + "tokens_seen": 675220480 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040171514543630896, + "loss": 2.9364, + "theoretical_loss": 3.793319264465551, + "tokens_seen": 675286016 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004017051153460381, + "loss": 2.9184, + "theoretical_loss": 3.7932816468413817, + "tokens_seen": 675351552 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004016950852557673, + "loss": 3.0524, + "theoretical_loss": 3.7932440338894438, + "tokens_seen": 675417088 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040168505516549645, + "loss": 2.8073, + "theoretical_loss": 3.793206425608705, + "tokens_seen": 675482624 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004016750250752257, + "loss": 2.8904, + "theoretical_loss": 3.7931688219981305, + "tokens_seen": 675548160 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040166499498495487, + "loss": 2.89, + "theoretical_loss": 3.7931312230566885, + "tokens_seen": 675613696 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040165496489468405, + "loss": 2.7991, + "theoretical_loss": 3.793093628783346, + "tokens_seen": 675679232 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004016449348044133, + "loss": 2.8354, + "theoretical_loss": 3.793056039177071, + "tokens_seen": 675744768 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004016349047141424, + "loss": 2.9576, + "theoretical_loss": 3.7930184542368313, + "tokens_seen": 675810304 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040162487462387165, + "loss": 2.9226, + "theoretical_loss": 3.7929808739615956, + "tokens_seen": 675875840 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040161484453360083, + "loss": 3.0325, + "theoretical_loss": 3.792943298350332, + "tokens_seen": 675941376 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040160481444333, + "loss": 3.0913, + "theoretical_loss": 3.7929057274020104, + "tokens_seen": 676006912 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004015947843530592, + "loss": 2.7124, + "theoretical_loss": 3.7928681611155994, + "tokens_seen": 676072448 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040158475426278837, + "loss": 2.9317, + "theoretical_loss": 3.7928305994900695, + "tokens_seen": 676137984 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040157472417251755, + "loss": 3.0415, + "theoretical_loss": 3.7927930425243903, + "tokens_seen": 676203520 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004015646940822468, + "loss": 3.0635, + "theoretical_loss": 3.792755490217532, + "tokens_seen": 676269056 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004015546639919759, + "loss": 2.9533, + "theoretical_loss": 3.7927179425684656, + "tokens_seen": 676334592 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040154463390170515, + "loss": 2.9813, + "theoretical_loss": 3.792680399576162, + "tokens_seen": 676400128 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040153460381143433, + "loss": 2.9449, + "theoretical_loss": 3.7926428612395924, + "tokens_seen": 676465664 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004015245737211635, + "loss": 3.0431, + "theoretical_loss": 3.7926053275577285, + "tokens_seen": 676531200 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004015145436308927, + "loss": 2.9951, + "theoretical_loss": 3.792567798529543, + "tokens_seen": 676596736 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0657544136047363, + "objective/train/theoretical_loss": 3.792539654811762, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.792539654811762, + "tokens_seen": 676645888 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004015045135406219, + "loss": 3.048, + "theoretical_loss": 3.7925302741540072, + "tokens_seen": 676662272 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040149448345035106, + "loss": 3.0247, + "theoretical_loss": 3.7924927544300946, + "tokens_seen": 676727808 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004014844533600803, + "loss": 2.9552, + "theoretical_loss": 3.7924552393567774, + "tokens_seen": 676793344 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004014744232698094, + "loss": 2.9172, + "theoretical_loss": 3.79241772893303, + "tokens_seen": 676858880 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040146439317953865, + "loss": 3.0389, + "theoretical_loss": 3.7923802231578247, + "tokens_seen": 676924416 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004014543630892678, + "loss": 2.9967, + "theoretical_loss": 3.7923427220301367, + "tokens_seen": 676989952 + }, + { + "epoch": 1.1, + "learning_rate": 0.000401444332998997, + "loss": 3.0426, + "theoretical_loss": 3.792305225548939, + "tokens_seen": 677055488 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004014343029087262, + "loss": 2.8993, + "theoretical_loss": 3.7922677337132074, + "tokens_seen": 677121024 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004014242728184554, + "loss": 3.116, + "theoretical_loss": 3.7922302465219158, + "tokens_seen": 677186560 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040141424272818456, + "loss": 2.9581, + "theoretical_loss": 3.7921927639740405, + "tokens_seen": 677252096 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004014042126379138, + "loss": 2.9955, + "theoretical_loss": 3.7921552860685566, + "tokens_seen": 677317632 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004013941825476429, + "loss": 3.0224, + "theoretical_loss": 3.7921178128044395, + "tokens_seen": 677383168 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040138415245737216, + "loss": 3.0125, + "theoretical_loss": 3.7920803441806665, + "tokens_seen": 677448704 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004013741223671013, + "loss": 2.9824, + "theoretical_loss": 3.792042880196213, + "tokens_seen": 677514240 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004013640922768305, + "loss": 2.9046, + "theoretical_loss": 3.7920054208500567, + "tokens_seen": 677579776 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004013540621865597, + "loss": 2.8934, + "theoretical_loss": 3.7919679661411747, + "tokens_seen": 677645312 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004013440320962889, + "loss": 3.0623, + "theoretical_loss": 3.7919305160685446, + "tokens_seen": 677710848 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040133400200601806, + "loss": 2.9535, + "theoretical_loss": 3.7918930706311444, + "tokens_seen": 677776384 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040132397191574724, + "loss": 2.9555, + "theoretical_loss": 3.791855629827951, + "tokens_seen": 677841920 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004013139418254764, + "loss": 2.9403, + "theoretical_loss": 3.7918181936579445, + "tokens_seen": 677907456 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040130391173520566, + "loss": 3.0529, + "theoretical_loss": 3.791780762120103, + "tokens_seen": 677972992 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004012938816449348, + "loss": 2.9218, + "theoretical_loss": 3.791743335213406, + "tokens_seen": 678038528 + }, + { + "epoch": 1.1, + "learning_rate": 0.000401283851554664, + "loss": 3.0307, + "theoretical_loss": 3.7917059129368322, + "tokens_seen": 678104064 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040127382146439315, + "loss": 2.8847, + "theoretical_loss": 3.7916684952893625, + "tokens_seen": 678169600 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004012637913741224, + "loss": 2.9443, + "theoretical_loss": 3.7916310822699764, + "tokens_seen": 678235136 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8735456466674805, + "objective/train/theoretical_loss": 3.7916030255420035, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.7916030255420035, + "tokens_seen": 678284288 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040125376128385157, + "loss": 2.937, + "theoretical_loss": 3.791593673877654, + "tokens_seen": 678300672 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040124373119358075, + "loss": 3.1022, + "theoretical_loss": 3.7915562701113767, + "tokens_seen": 678366208 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040123370110330993, + "loss": 2.7806, + "theoretical_loss": 3.791518870970126, + "tokens_seen": 678431744 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040122367101303916, + "loss": 2.9503, + "theoretical_loss": 3.791481476452882, + "tokens_seen": 678497280 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004012136409227683, + "loss": 2.9241, + "theoretical_loss": 3.791444086558627, + "tokens_seen": 678562816 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004012036108324975, + "loss": 2.9662, + "theoretical_loss": 3.791406701286344, + "tokens_seen": 678628352 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040119358074222665, + "loss": 2.8655, + "theoretical_loss": 3.791369320635014, + "tokens_seen": 678693888 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004011835506519559, + "loss": 2.9372, + "theoretical_loss": 3.7913319446036207, + "tokens_seen": 678759424 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040117352056168507, + "loss": 2.9772, + "theoretical_loss": 3.7912945731911467, + "tokens_seen": 678824960 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040116349047141425, + "loss": 2.9594, + "theoretical_loss": 3.7912572063965753, + "tokens_seen": 678890496 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040115346038114343, + "loss": 3.1587, + "theoretical_loss": 3.79121984421889, + "tokens_seen": 678956032 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004011434302908726, + "loss": 3.0445, + "theoretical_loss": 3.791182486657075, + "tokens_seen": 679021568 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004011334002006018, + "loss": 2.8823, + "theoretical_loss": 3.791145133710115, + "tokens_seen": 679087104 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040112337011033103, + "loss": 3.1779, + "theoretical_loss": 3.7911077853769948, + "tokens_seen": 679152640 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040111334002006016, + "loss": 2.9022, + "theoretical_loss": 3.7910704416566983, + "tokens_seen": 679218176 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004011033099297894, + "loss": 2.8836, + "theoretical_loss": 3.7910331025482114, + "tokens_seen": 679283712 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004010932798395185, + "loss": 3.1957, + "theoretical_loss": 3.7909957680505197, + "tokens_seen": 679349248 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040108324974924775, + "loss": 2.8583, + "theoretical_loss": 3.7909584381626087, + "tokens_seen": 679414784 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040107321965897693, + "loss": 3.0602, + "theoretical_loss": 3.7909211128834652, + "tokens_seen": 679480320 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004010631895687061, + "loss": 3.0166, + "theoretical_loss": 3.7908837922120764, + "tokens_seen": 679545856 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004010531594784353, + "loss": 3.0692, + "theoretical_loss": 3.7908464761474274, + "tokens_seen": 679611392 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040104312938816453, + "loss": 2.9244, + "theoretical_loss": 3.7908091646885067, + "tokens_seen": 679676928 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040103309929789366, + "loss": 3.0165, + "theoretical_loss": 3.7907718578343017, + "tokens_seen": 679742464 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004010230692076229, + "loss": 3.0311, + "theoretical_loss": 3.7907345555838, + "tokens_seen": 679808000 + }, + { + "epoch": 1.1, + "learning_rate": 0.000401013039117352, + "loss": 2.8449, + "theoretical_loss": 3.7906972579359897, + "tokens_seen": 679873536 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.823486089706421, + "objective/train/theoretical_loss": 3.79066928772004, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.79066928772004, + "tokens_seen": 679922688 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040100300902708126, + "loss": 2.8913, + "theoretical_loss": 3.7906599648898593, + "tokens_seen": 679939072 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040099297893681044, + "loss": 3.2507, + "theoretical_loss": 3.790622676444398, + "tokens_seen": 680004608 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004009829488465396, + "loss": 2.8862, + "theoretical_loss": 3.7905853925985946, + "tokens_seen": 680070144 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004009729187562688, + "loss": 2.9818, + "theoretical_loss": 3.7905481133514387, + "tokens_seen": 680135680 + }, + { + "epoch": 1.1, + "learning_rate": 0.000400962888665998, + "loss": 2.9382, + "theoretical_loss": 3.7905108387019197, + "tokens_seen": 680201216 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040095285857572716, + "loss": 3.0179, + "theoretical_loss": 3.790473568649028, + "tokens_seen": 680266752 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004009428284854564, + "loss": 2.9214, + "theoretical_loss": 3.790436303191754, + "tokens_seen": 680332288 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004009327983951855, + "loss": 2.9478, + "theoretical_loss": 3.7903990423290885, + "tokens_seen": 680397824 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040092276830491476, + "loss": 2.8384, + "theoretical_loss": 3.7903617860600223, + "tokens_seen": 680463360 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004009127382146439, + "loss": 3.0308, + "theoretical_loss": 3.7903245343835468, + "tokens_seen": 680528896 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004009027081243731, + "loss": 3.0147, + "theoretical_loss": 3.7902872872986535, + "tokens_seen": 680594432 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040089267803410236, + "loss": 2.938, + "theoretical_loss": 3.790250044804335, + "tokens_seen": 680659968 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004008826479438315, + "loss": 3.0052, + "theoretical_loss": 3.7902128068995835, + "tokens_seen": 680725504 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004008726178535607, + "loss": 2.9489, + "theoretical_loss": 3.790175573583391, + "tokens_seen": 680791040 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004008625877632899, + "loss": 2.9534, + "theoretical_loss": 3.7901383448547508, + "tokens_seen": 680856576 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004008525576730191, + "loss": 3.159, + "theoretical_loss": 3.7901011207126567, + "tokens_seen": 680922112 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040084252758274826, + "loss": 2.7603, + "theoretical_loss": 3.790063901156101, + "tokens_seen": 680987648 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040083249749247744, + "loss": 3.0566, + "theoretical_loss": 3.7900266861840786, + "tokens_seen": 681053184 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004008224674022066, + "loss": 3.0052, + "theoretical_loss": 3.7899894757955837, + "tokens_seen": 681118720 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040081243731193586, + "loss": 2.9409, + "theoretical_loss": 3.7899522699896107, + "tokens_seen": 681184256 + }, + { + "epoch": 1.1, + "learning_rate": 0.000400802407221665, + "loss": 2.9018, + "theoretical_loss": 3.789915068765155, + "tokens_seen": 681249792 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004007923771313942, + "loss": 3.2266, + "theoretical_loss": 3.7898778721212105, + "tokens_seen": 681315328 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040078234704112335, + "loss": 2.7013, + "theoretical_loss": 3.7898406800567734, + "tokens_seen": 681380864 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004007723169508526, + "loss": 3.0432, + "theoretical_loss": 3.7898034925708393, + "tokens_seen": 681446400 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040076228686058177, + "loss": 2.9992, + "theoretical_loss": 3.7897663096624052, + "tokens_seen": 681511936 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.002262830734253, + "objective/train/theoretical_loss": 3.789738425484459, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.789738425484459, + "tokens_seen": 681561088 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040075225677031095, + "loss": 2.9335, + "theoretical_loss": 3.7897291313304664, + "tokens_seen": 681577472 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040074222668004013, + "loss": 3.0209, + "theoretical_loss": 3.78969195757402, + "tokens_seen": 681643008 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040073219658976936, + "loss": 3.0287, + "theoretical_loss": 3.7896547883920633, + "tokens_seen": 681708544 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004007221664994985, + "loss": 2.8736, + "theoretical_loss": 3.789617623783594, + "tokens_seen": 681774080 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004007121364092277, + "loss": 3.0289, + "theoretical_loss": 3.789580463747609, + "tokens_seen": 681839616 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040070210631895685, + "loss": 2.7963, + "theoretical_loss": 3.7895433082831067, + "tokens_seen": 681905152 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004006920762286861, + "loss": 3.1223, + "theoretical_loss": 3.789506157389085, + "tokens_seen": 681970688 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040068204613841527, + "loss": 2.9818, + "theoretical_loss": 3.789469011064544, + "tokens_seen": 682036224 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040067201604814445, + "loss": 3.1048, + "theoretical_loss": 3.7894318693084807, + "tokens_seen": 682101760 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040066198595787363, + "loss": 2.8058, + "theoretical_loss": 3.789394732119896, + "tokens_seen": 682167296 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004006519558676028, + "loss": 2.9376, + "theoretical_loss": 3.789357599497789, + "tokens_seen": 682232832 + }, + { + "epoch": 1.1, + "learning_rate": 0.000400641925777332, + "loss": 2.9427, + "theoretical_loss": 3.789320471441159, + "tokens_seen": 682298368 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040063189568706123, + "loss": 3.1207, + "theoretical_loss": 3.7892833479490067, + "tokens_seen": 682363904 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040062186559679036, + "loss": 2.8224, + "theoretical_loss": 3.789246229020333, + "tokens_seen": 682429440 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004006118355065196, + "loss": 2.8495, + "theoretical_loss": 3.789209114654138, + "tokens_seen": 682494976 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004006018054162487, + "loss": 3.099, + "theoretical_loss": 3.7891720048494233, + "tokens_seen": 682560512 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040059177532597795, + "loss": 3.1037, + "theoretical_loss": 3.7891348996051906, + "tokens_seen": 682626048 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040058174523570713, + "loss": 2.963, + "theoretical_loss": 3.7890977989204413, + "tokens_seen": 682691584 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004005717151454363, + "loss": 2.9567, + "theoretical_loss": 3.7890607027941776, + "tokens_seen": 682757120 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004005616850551655, + "loss": 3.0641, + "theoretical_loss": 3.7890236112254025, + "tokens_seen": 682822656 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040055165496489473, + "loss": 2.9542, + "theoretical_loss": 3.7889865242131178, + "tokens_seen": 682888192 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040054162487462386, + "loss": 3.0991, + "theoretical_loss": 3.788949441756327, + "tokens_seen": 682953728 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004005315947843531, + "loss": 2.8292, + "theoretical_loss": 3.788912363854034, + "tokens_seen": 683019264 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004005215646940822, + "loss": 2.9929, + "theoretical_loss": 3.7888752905052416, + "tokens_seen": 683084800 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040051153460381146, + "loss": 3.047, + "theoretical_loss": 3.7888382217089545, + "tokens_seen": 683150336 + }, + { + "epoch": 1.1, + "objective/train/docs_used": 1075722, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0768442153930664, + "objective/train/theoretical_loss": 3.788810423098721, + "objective/train/tokens_used": 683395552, + "theoretical_loss": 3.788810423098721, + "tokens_seen": 683199488 + }, + { + "epoch": 1.1, + "learning_rate": 0.00040050150451354064, + "loss": 2.9136, + "theoretical_loss": 3.7888011574641762, + "tokens_seen": 683215872 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004004914744232698, + "loss": 3.0733, + "theoretical_loss": 3.788764097769912, + "tokens_seen": 683281408 + }, + { + "epoch": 1.1, + "learning_rate": 0.000400481444332999, + "loss": 2.912, + "theoretical_loss": 3.788727042625167, + "tokens_seen": 683346944 + }, + { + "epoch": 1.1, + "learning_rate": 0.0004004714142427282, + "loss": 2.8385, + "theoretical_loss": 3.788689992028946, + "tokens_seen": 683412480 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040046138415245736, + "loss": 3.6256, + "theoretical_loss": 3.7886512095582843, + "tokens_seen": 683481088 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004004513540621866, + "loss": 2.9572, + "theoretical_loss": 3.788614168269223, + "tokens_seen": 683546624 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004004413239719157, + "loss": 2.8632, + "theoretical_loss": 3.7885771315256567, + "tokens_seen": 683612160 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040043129388164496, + "loss": 2.9331, + "theoretical_loss": 3.7885400993265925, + "tokens_seen": 683677696 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004004212637913741, + "loss": 2.9842, + "theoretical_loss": 3.788503071671036, + "tokens_seen": 683743232 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004004112337011033, + "loss": 3.0619, + "theoretical_loss": 3.7884660485579964, + "tokens_seen": 683808768 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004004012036108325, + "loss": 3.0531, + "theoretical_loss": 3.788429029986479, + "tokens_seen": 683874304 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004003911735205617, + "loss": 3.0719, + "theoretical_loss": 3.788392015955493, + "tokens_seen": 683939840 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040038114343029087, + "loss": 2.9995, + "theoretical_loss": 3.7883550064640454, + "tokens_seen": 684005376 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004003711133400201, + "loss": 2.9646, + "theoretical_loss": 3.7883180015111457, + "tokens_seen": 684070912 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040036108324974923, + "loss": 2.9994, + "theoretical_loss": 3.788281001095802, + "tokens_seen": 684136448 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040035105315947846, + "loss": 3.058, + "theoretical_loss": 3.788244005217023, + "tokens_seen": 684201984 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004003410230692076, + "loss": 3.0138, + "theoretical_loss": 3.7882070138738193, + "tokens_seen": 684267520 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004003309929789368, + "loss": 3.0544, + "theoretical_loss": 3.7881700270651986, + "tokens_seen": 684333056 + }, + { + "epoch": 2.0, + "learning_rate": 0.000400320962888666, + "loss": 3.1616, + "theoretical_loss": 3.7881330447901727, + "tokens_seen": 684398592 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004003109327983952, + "loss": 2.9558, + "theoretical_loss": 3.788096067047751, + "tokens_seen": 684464128 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040030090270812437, + "loss": 3.0498, + "theoretical_loss": 3.7880590938369436, + "tokens_seen": 684529664 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040029087261785355, + "loss": 3.1444, + "theoretical_loss": 3.788022125156762, + "tokens_seen": 684595200 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040028084252758273, + "loss": 2.9309, + "theoretical_loss": 3.7879851610062176, + "tokens_seen": 684660736 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040027081243731197, + "loss": 2.845, + "theoretical_loss": 3.7879482013843213, + "tokens_seen": 684726272 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004002607823470411, + "loss": 3.176, + "theoretical_loss": 3.7879112462900855, + "tokens_seen": 684791808 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1110494, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.297323703765869, + "objective/train/theoretical_loss": 3.7878927704405316, + "objective/train/tokens_used": 705284576, + "theoretical_loss": 3.7878927704405316, + "tokens_seen": 684824576 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040025075225677033, + "loss": 3.1659, + "theoretical_loss": 3.787874295722522, + "tokens_seen": 684857344 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040024072216649946, + "loss": 2.9501, + "theoretical_loss": 3.7878373496806432, + "tokens_seen": 684922880 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004002306920762287, + "loss": 3.0822, + "theoretical_loss": 3.787800408163462, + "tokens_seen": 684988416 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040022066198595787, + "loss": 2.8077, + "theoretical_loss": 3.787763471169991, + "tokens_seen": 685053952 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040021063189568705, + "loss": 3.038, + "theoretical_loss": 3.7877265386992445, + "tokens_seen": 685119488 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040020060180541623, + "loss": 2.9386, + "theoretical_loss": 3.7876896107502356, + "tokens_seen": 685185024 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040019057171514547, + "loss": 3.0526, + "theoretical_loss": 3.7876526873219776, + "tokens_seen": 685250560 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004001805416248746, + "loss": 3.089, + "theoretical_loss": 3.787615768413486, + "tokens_seen": 685316096 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040017051153460383, + "loss": 3.0959, + "theoretical_loss": 3.7875788540237747, + "tokens_seen": 685381632 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040016048144433296, + "loss": 2.9487, + "theoretical_loss": 3.787541944151859, + "tokens_seen": 685447168 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004001504513540622, + "loss": 2.9623, + "theoretical_loss": 3.7875050387967533, + "tokens_seen": 685512704 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040014042126379143, + "loss": 2.8283, + "theoretical_loss": 3.787468137957474, + "tokens_seen": 685578240 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040013039117352056, + "loss": 2.9542, + "theoretical_loss": 3.787431241633037, + "tokens_seen": 685643776 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004001203610832498, + "loss": 2.9124, + "theoretical_loss": 3.7873943498224576, + "tokens_seen": 685709312 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004001103309929789, + "loss": 2.903, + "theoretical_loss": 3.787357462524753, + "tokens_seen": 685774848 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040010030090270815, + "loss": 2.9698, + "theoretical_loss": 3.7873205797389398, + "tokens_seen": 685840384 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040009027081243733, + "loss": 3.0188, + "theoretical_loss": 3.787283701464035, + "tokens_seen": 685905920 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004000802407221665, + "loss": 2.9344, + "theoretical_loss": 3.787246827699055, + "tokens_seen": 685971456 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004000702106318957, + "loss": 2.9296, + "theoretical_loss": 3.787209958443019, + "tokens_seen": 686036992 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040006018054162493, + "loss": 3.0169, + "theoretical_loss": 3.7871730936949453, + "tokens_seen": 686102528 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040005015045135406, + "loss": 3.0696, + "theoretical_loss": 3.7871362334538503, + "tokens_seen": 686168064 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004000401203610833, + "loss": 3.0761, + "theoretical_loss": 3.7870993777187536, + "tokens_seen": 686233600 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004000300902708124, + "loss": 3.0935, + "theoretical_loss": 3.7870625264886746, + "tokens_seen": 686299136 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040002006018054166, + "loss": 3.0582, + "theoretical_loss": 3.787025679762632, + "tokens_seen": 686364672 + }, + { + "epoch": 2.0, + "learning_rate": 0.00040001003009027084, + "loss": 3.0499, + "theoretical_loss": 3.786988837539645, + "tokens_seen": 686430208 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1113529, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1577889919281006, + "objective/train/theoretical_loss": 3.7869704181164914, + "objective/train/tokens_used": 706922976, + "theoretical_loss": 3.7869704181164914, + "tokens_seen": 686462976 + }, + { + "epoch": 2.0, + "learning_rate": 0.0004, + "loss": 3.0584, + "theoretical_loss": 3.7869519998187338, + "tokens_seen": 686495744 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003999899699097292, + "loss": 3.04, + "theoretical_loss": 3.786915166598919, + "tokens_seen": 686561280 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003999799398194584, + "loss": 3.0314, + "theoretical_loss": 3.7868783378792203, + "tokens_seen": 686626816 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039996990972918756, + "loss": 3.005, + "theoretical_loss": 3.7868415136586595, + "tokens_seen": 686692352 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003999598796389168, + "loss": 2.8792, + "theoretical_loss": 3.786804693936256, + "tokens_seen": 686757888 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003999498495486459, + "loss": 3.0321, + "theoretical_loss": 3.786767878711032, + "tokens_seen": 686823424 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039993981945837516, + "loss": 2.9793, + "theoretical_loss": 3.78673106798201, + "tokens_seen": 686888960 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003999297893681043, + "loss": 2.9884, + "theoretical_loss": 3.786694261748211, + "tokens_seen": 686954496 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003999197592778335, + "loss": 2.9827, + "theoretical_loss": 3.7866574600086573, + "tokens_seen": 687020032 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003999097291875627, + "loss": 3.0376, + "theoretical_loss": 3.786620662762372, + "tokens_seen": 687085568 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003998996990972919, + "loss": 3.043, + "theoretical_loss": 3.7865838700083776, + "tokens_seen": 687151104 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039988966900702107, + "loss": 2.9747, + "theoretical_loss": 3.7865470817456974, + "tokens_seen": 687216640 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003998796389167503, + "loss": 3.1452, + "theoretical_loss": 3.786510297973355, + "tokens_seen": 687282176 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039986960882647943, + "loss": 3.0495, + "theoretical_loss": 3.786473518690374, + "tokens_seen": 687347712 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039985957873620866, + "loss": 3.089, + "theoretical_loss": 3.786436743895779, + "tokens_seen": 687413248 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003998495486459378, + "loss": 3.0732, + "theoretical_loss": 3.7863999735885936, + "tokens_seen": 687478784 + }, + { + "epoch": 2.0, + "learning_rate": 0.000399839518555667, + "loss": 2.9395, + "theoretical_loss": 3.7863632077678435, + "tokens_seen": 687544320 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003998294884653962, + "loss": 3.0214, + "theoretical_loss": 3.786326446432553, + "tokens_seen": 687609856 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003998194583751254, + "loss": 2.97, + "theoretical_loss": 3.7862896895817477, + "tokens_seen": 687675392 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039980942828485457, + "loss": 3.0973, + "theoretical_loss": 3.7862529372144538, + "tokens_seen": 687740928 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039979939819458375, + "loss": 2.8003, + "theoretical_loss": 3.7862161893296955, + "tokens_seen": 687806464 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039978936810431293, + "loss": 2.7301, + "theoretical_loss": 3.786179445926501, + "tokens_seen": 687872000 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039977933801404217, + "loss": 3.0567, + "theoretical_loss": 3.786142707003896, + "tokens_seen": 687937536 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003997693079237713, + "loss": 3.0755, + "theoretical_loss": 3.7861059725609074, + "tokens_seen": 688003072 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039975927783350053, + "loss": 3.1333, + "theoretical_loss": 3.7860692425965627, + "tokens_seen": 688068608 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1116343, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2549080848693848, + "objective/train/theoretical_loss": 3.786050879293578, + "objective/train/tokens_used": 708561376, + "theoretical_loss": 3.786050879293578, + "tokens_seen": 688101376 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039974924774322966, + "loss": 3.041, + "theoretical_loss": 3.786032517109889, + "tokens_seen": 688134144 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003997392176529589, + "loss": 2.7969, + "theoretical_loss": 3.7859957960999147, + "tokens_seen": 688199680 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039972918756268807, + "loss": 2.9915, + "theoretical_loss": 3.785959079565666, + "tokens_seen": 688265216 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039971915747241725, + "loss": 3.1507, + "theoretical_loss": 3.785922367506174, + "tokens_seen": 688330752 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039970912738214643, + "loss": 3.024, + "theoretical_loss": 3.7858856599204653, + "tokens_seen": 688396288 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039969909729187567, + "loss": 3.0555, + "theoretical_loss": 3.7858489568075697, + "tokens_seen": 688461824 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003996890672016048, + "loss": 2.6951, + "theoretical_loss": 3.7858122581665166, + "tokens_seen": 688527360 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039967903711133403, + "loss": 3.1449, + "theoretical_loss": 3.785775563996335, + "tokens_seen": 688592896 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039966900702106316, + "loss": 3.0415, + "theoretical_loss": 3.785738874296056, + "tokens_seen": 688658432 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003996589769307924, + "loss": 3.0497, + "theoretical_loss": 3.7857021890647085, + "tokens_seen": 688723968 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003996489468405216, + "loss": 3.1652, + "theoretical_loss": 3.785665508301323, + "tokens_seen": 688789504 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039963891675025076, + "loss": 3.0063, + "theoretical_loss": 3.785628832004931, + "tokens_seen": 688855040 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039962888665997994, + "loss": 3.0, + "theoretical_loss": 3.7855921601745637, + "tokens_seen": 688920576 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003996188565697091, + "loss": 3.0375, + "theoretical_loss": 3.7855554928092525, + "tokens_seen": 688986112 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003996088264794383, + "loss": 3.0924, + "theoretical_loss": 3.7855188299080282, + "tokens_seen": 689051648 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039959879638916754, + "loss": 2.9744, + "theoretical_loss": 3.7854821714699236, + "tokens_seen": 689117184 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039958876629889666, + "loss": 3.1441, + "theoretical_loss": 3.7854455174939714, + "tokens_seen": 689182720 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003995787362086259, + "loss": 2.9353, + "theoretical_loss": 3.785408867979203, + "tokens_seen": 689248256 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003995687061183551, + "loss": 2.8747, + "theoretical_loss": 3.785372222924652, + "tokens_seen": 689313792 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039955867602808426, + "loss": 2.9572, + "theoretical_loss": 3.7853355823293517, + "tokens_seen": 689379328 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039954864593781344, + "loss": 3.0005, + "theoretical_loss": 3.785298946192336, + "tokens_seen": 689444864 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003995386158475426, + "loss": 3.0287, + "theoretical_loss": 3.7852623145126376, + "tokens_seen": 689510400 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003995285857572718, + "loss": 3.0798, + "theoretical_loss": 3.7852256872892918, + "tokens_seen": 689575936 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039951855566700104, + "loss": 3.0431, + "theoretical_loss": 3.785189064521332, + "tokens_seen": 689641472 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039950852557673017, + "loss": 3.1394, + "theoretical_loss": 3.785152446207794, + "tokens_seen": 689707008 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1118568, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0945322513580322, + "objective/train/theoretical_loss": 3.7851341387211317, + "objective/train/tokens_used": 710199776, + "theoretical_loss": 3.7851341387211317, + "tokens_seen": 689739776 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003994984954864594, + "loss": 3.079, + "theoretical_loss": 3.785115832347712, + "tokens_seen": 689772544 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039948846539618853, + "loss": 2.8255, + "theoretical_loss": 3.785079222940122, + "tokens_seen": 689838080 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039947843530591776, + "loss": 2.917, + "theoretical_loss": 3.7850426179840584, + "tokens_seen": 689903616 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039946840521564694, + "loss": 2.9874, + "theoretical_loss": 3.7850060174785582, + "tokens_seen": 689969152 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003994583751253761, + "loss": 2.9839, + "theoretical_loss": 3.784969421422658, + "tokens_seen": 690034688 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003994483450351053, + "loss": 3.0232, + "theoretical_loss": 3.784932829815393, + "tokens_seen": 690100224 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003994383149448345, + "loss": 3.0049, + "theoretical_loss": 3.784896242655801, + "tokens_seen": 690165760 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039942828485456367, + "loss": 3.1297, + "theoretical_loss": 3.7848596599429185, + "tokens_seen": 690231296 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003994182547642929, + "loss": 3.1141, + "theoretical_loss": 3.7848230816757837, + "tokens_seen": 690296832 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039940822467402203, + "loss": 3.0945, + "theoretical_loss": 3.7847865078534335, + "tokens_seen": 690362368 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039939819458375127, + "loss": 3.0814, + "theoretical_loss": 3.7847499384749073, + "tokens_seen": 690427904 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003993881644934805, + "loss": 3.2057, + "theoretical_loss": 3.7847133735392413, + "tokens_seen": 690493440 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039937813440320963, + "loss": 2.8865, + "theoretical_loss": 3.784676813045476, + "tokens_seen": 690558976 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039936810431293886, + "loss": 3.083, + "theoretical_loss": 3.78464025699265, + "tokens_seen": 690624512 + }, + { + "epoch": 2.0, + "learning_rate": 0.000399358074222668, + "loss": 3.0143, + "theoretical_loss": 3.784603705379801, + "tokens_seen": 690690048 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003993480441323972, + "loss": 2.9061, + "theoretical_loss": 3.7845671582059706, + "tokens_seen": 690755584 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003993380140421264, + "loss": 2.9693, + "theoretical_loss": 3.784530615470197, + "tokens_seen": 690821120 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003993279839518556, + "loss": 2.9299, + "theoretical_loss": 3.7844940771715216, + "tokens_seen": 690886656 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039931795386158477, + "loss": 3.0038, + "theoretical_loss": 3.784457543308984, + "tokens_seen": 690952192 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039930792377131395, + "loss": 2.9987, + "theoretical_loss": 3.7844210138816257, + "tokens_seen": 691017728 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039929789368104313, + "loss": 2.9491, + "theoretical_loss": 3.7843844888884863, + "tokens_seen": 691083264 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039928786359077237, + "loss": 3.0755, + "theoretical_loss": 3.7843479683286088, + "tokens_seen": 691148800 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003992778335005015, + "loss": 2.9555, + "theoretical_loss": 3.7843114522010337, + "tokens_seen": 691214336 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039926780341023073, + "loss": 2.993, + "theoretical_loss": 3.784274940504803, + "tokens_seen": 691279872 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039925777331995986, + "loss": 3.2307, + "theoretical_loss": 3.7842384332389596, + "tokens_seen": 691345408 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1121425, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.085228681564331, + "objective/train/theoretical_loss": 3.7842201812671337, + "objective/train/tokens_used": 711838176, + "theoretical_loss": 3.7842201812671337, + "tokens_seen": 691378176 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003992477432296891, + "loss": 3.1131, + "theoretical_loss": 3.784201930402545, + "tokens_seen": 691410944 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039923771313941827, + "loss": 2.899, + "theoretical_loss": 3.7841654319946034, + "tokens_seen": 691476480 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039922768304914745, + "loss": 3.1718, + "theoretical_loss": 3.7841289380141765, + "tokens_seen": 691542016 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039921765295887663, + "loss": 2.9519, + "theoretical_loss": 3.7840924484603082, + "tokens_seen": 691607552 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039920762286860587, + "loss": 2.9233, + "theoretical_loss": 3.784055963332042, + "tokens_seen": 691673088 + }, + { + "epoch": 2.0, + "learning_rate": 0.000399197592778335, + "loss": 2.9613, + "theoretical_loss": 3.7840194826284232, + "tokens_seen": 691738624 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039918756268806423, + "loss": 3.0866, + "theoretical_loss": 3.7839830063484943, + "tokens_seen": 691804160 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039917753259779336, + "loss": 2.9686, + "theoretical_loss": 3.7839465344913004, + "tokens_seen": 691869696 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003991675025075226, + "loss": 2.998, + "theoretical_loss": 3.7839100670558867, + "tokens_seen": 691935232 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003991574724172518, + "loss": 2.7931, + "theoretical_loss": 3.7838736040412986, + "tokens_seen": 692000768 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039914744232698096, + "loss": 2.9947, + "theoretical_loss": 3.7838371454465816, + "tokens_seen": 692066304 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039913741223671014, + "loss": 2.9839, + "theoretical_loss": 3.783800691270781, + "tokens_seen": 692131840 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003991273821464393, + "loss": 3.0701, + "theoretical_loss": 3.7837642415129427, + "tokens_seen": 692197376 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003991173520561685, + "loss": 3.04, + "theoretical_loss": 3.7837277961721134, + "tokens_seen": 692262912 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039910732196589774, + "loss": 3.1435, + "theoretical_loss": 3.78369135524734, + "tokens_seen": 692328448 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039909729187562686, + "loss": 2.9338, + "theoretical_loss": 3.7836549187376693, + "tokens_seen": 692393984 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003990872617853561, + "loss": 2.9945, + "theoretical_loss": 3.7836184866421485, + "tokens_seen": 692459520 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003990772316950853, + "loss": 2.8714, + "theoretical_loss": 3.7835820589598246, + "tokens_seen": 692525056 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039906720160481446, + "loss": 3.0018, + "theoretical_loss": 3.7835456356897463, + "tokens_seen": 692590592 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039905717151454364, + "loss": 2.824, + "theoretical_loss": 3.7835092168309616, + "tokens_seen": 692656128 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003990471414242728, + "loss": 2.8435, + "theoretical_loss": 3.7834728023825184, + "tokens_seen": 692721664 + }, + { + "epoch": 2.0, + "learning_rate": 0.000399037111334002, + "loss": 3.136, + "theoretical_loss": 3.783436392343466, + "tokens_seen": 692787200 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039902708124373124, + "loss": 2.9604, + "theoretical_loss": 3.783399986712854, + "tokens_seen": 692852736 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039901705115346037, + "loss": 2.968, + "theoretical_loss": 3.78336358548973, + "tokens_seen": 692918272 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003990070210631896, + "loss": 3.2015, + "theoretical_loss": 3.7833271886731445, + "tokens_seen": 692983808 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1124400, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8312175273895264, + "objective/train/theoretical_loss": 3.783308991917007, + "objective/train/tokens_used": 713476576, + "theoretical_loss": 3.783308991917007, + "tokens_seen": 693016576 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039899699097291873, + "loss": 2.9998, + "theoretical_loss": 3.783290796262148, + "tokens_seen": 693049344 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039898696088264796, + "loss": 2.9154, + "theoretical_loss": 3.7832544082557904, + "tokens_seen": 693114880 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039897693079237714, + "loss": 3.0349, + "theoretical_loss": 3.783218024653121, + "tokens_seen": 693180416 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003989669007021063, + "loss": 2.9058, + "theoretical_loss": 3.7831816454531926, + "tokens_seen": 693245952 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003989568706118355, + "loss": 2.9408, + "theoretical_loss": 3.783145270655055, + "tokens_seen": 693311488 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003989468405215647, + "loss": 2.9563, + "theoretical_loss": 3.7831089002577594, + "tokens_seen": 693377024 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039893681043129387, + "loss": 2.9825, + "theoretical_loss": 3.7830725342603584, + "tokens_seen": 693442560 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003989267803410231, + "loss": 2.9927, + "theoretical_loss": 3.7830361726619035, + "tokens_seen": 693508096 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039891675025075223, + "loss": 2.936, + "theoretical_loss": 3.782999815461447, + "tokens_seen": 693573632 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039890672016048147, + "loss": 2.9709, + "theoretical_loss": 3.7829634626580413, + "tokens_seen": 693639168 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039889669007021065, + "loss": 2.956, + "theoretical_loss": 3.78292711425074, + "tokens_seen": 693704704 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039888665997993983, + "loss": 2.8688, + "theoretical_loss": 3.7828907702385948, + "tokens_seen": 693770240 + }, + { + "epoch": 2.0, + "learning_rate": 0.000398876629889669, + "loss": 3.0846, + "theoretical_loss": 3.7828544306206604, + "tokens_seen": 693835776 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003988665997993982, + "loss": 2.889, + "theoretical_loss": 3.7828180953959905, + "tokens_seen": 693901312 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039885656970912737, + "loss": 3.0422, + "theoretical_loss": 3.782781764563638, + "tokens_seen": 693966848 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003988465396188566, + "loss": 2.7853, + "theoretical_loss": 3.7827454381226584, + "tokens_seen": 694032384 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039883650952858573, + "loss": 3.1144, + "theoretical_loss": 3.7827091160721062, + "tokens_seen": 694097920 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039882647943831497, + "loss": 2.9581, + "theoretical_loss": 3.7826727984110358, + "tokens_seen": 694163456 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003988164493480441, + "loss": 3.2088, + "theoretical_loss": 3.782636485138503, + "tokens_seen": 694228992 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039880641925777333, + "loss": 2.9004, + "theoretical_loss": 3.782600176253562, + "tokens_seen": 694294528 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003987963891675025, + "loss": 3.1157, + "theoretical_loss": 3.78256387175527, + "tokens_seen": 694360064 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003987863590772317, + "loss": 3.0521, + "theoretical_loss": 3.7825275716426825, + "tokens_seen": 694425600 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003987763289869609, + "loss": 2.8437, + "theoretical_loss": 3.7824912759148557, + "tokens_seen": 694491136 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039876629889669006, + "loss": 3.0146, + "theoretical_loss": 3.7824549845708466, + "tokens_seen": 694556672 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039875626880641924, + "loss": 3.0042, + "theoretical_loss": 3.782418697609712, + "tokens_seen": 694622208 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1127334, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.086352825164795, + "objective/train/theoretical_loss": 3.782400555772428, + "objective/train/tokens_used": 715114976, + "theoretical_loss": 3.782400555772428, + "tokens_seen": 694654976 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039874623871614847, + "loss": 2.8481, + "theoretical_loss": 3.7823824150305096, + "tokens_seen": 694687744 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003987362086258776, + "loss": 3.0149, + "theoretical_loss": 3.782346136832296, + "tokens_seen": 694753280 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039872617853560683, + "loss": 3.0678, + "theoretical_loss": 3.78230986301413, + "tokens_seen": 694818816 + }, + { + "epoch": 2.0, + "learning_rate": 0.000398716148445336, + "loss": 2.8682, + "theoretical_loss": 3.782273593575069, + "tokens_seen": 694884352 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003987061183550652, + "loss": 3.0123, + "theoretical_loss": 3.782237328514172, + "tokens_seen": 694949888 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003986960882647944, + "loss": 2.9868, + "theoretical_loss": 3.7822010678304974, + "tokens_seen": 695015424 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039868605817452356, + "loss": 3.0318, + "theoretical_loss": 3.7821648115231037, + "tokens_seen": 695080960 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039867602808425274, + "loss": 2.997, + "theoretical_loss": 3.782128559591051, + "tokens_seen": 695146496 + }, + { + "epoch": 2.0, + "learning_rate": 0.000398665997993982, + "loss": 2.9473, + "theoretical_loss": 3.7820923120333987, + "tokens_seen": 695212032 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003986559679037111, + "loss": 3.0356, + "theoretical_loss": 3.782056068849206, + "tokens_seen": 695277568 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039864593781344034, + "loss": 2.9742, + "theoretical_loss": 3.782019830037534, + "tokens_seen": 695343104 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003986359077231695, + "loss": 2.9323, + "theoretical_loss": 3.781983595597443, + "tokens_seen": 695408640 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003986258776328987, + "loss": 3.0605, + "theoretical_loss": 3.7819473655279925, + "tokens_seen": 695474176 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039861584754262794, + "loss": 2.8774, + "theoretical_loss": 3.781911139828245, + "tokens_seen": 695539712 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039860581745235706, + "loss": 2.9249, + "theoretical_loss": 3.7818749184972615, + "tokens_seen": 695605248 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003985957873620863, + "loss": 3.1183, + "theoretical_loss": 3.7818387015341033, + "tokens_seen": 695670784 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003985857572718155, + "loss": 2.8699, + "theoretical_loss": 3.7818024889378323, + "tokens_seen": 695736320 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039857572718154466, + "loss": 2.9249, + "theoretical_loss": 3.7817662807075108, + "tokens_seen": 695801856 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039856569709127384, + "loss": 3.0184, + "theoretical_loss": 3.781730076842201, + "tokens_seen": 695867392 + }, + { + "epoch": 2.0, + "learning_rate": 0.000398555667001003, + "loss": 3.1053, + "theoretical_loss": 3.7816938773409663, + "tokens_seen": 695932928 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003985456369107322, + "loss": 3.096, + "theoretical_loss": 3.7816576822028694, + "tokens_seen": 695998464 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039853560682046144, + "loss": 3.0403, + "theoretical_loss": 3.7816214914269732, + "tokens_seen": 696064000 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039852557673019057, + "loss": 2.8725, + "theoretical_loss": 3.7815853050123422, + "tokens_seen": 696129536 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003985155466399198, + "loss": 3.0624, + "theoretical_loss": 3.78154912295804, + "tokens_seen": 696195072 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039850551654964893, + "loss": 3.0268, + "theoretical_loss": 3.78151294526313, + "tokens_seen": 696260608 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1130214, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1926960945129395, + "objective/train/theoretical_loss": 3.7814948580501557, + "objective/train/tokens_used": 716753376, + "theoretical_loss": 3.7814948580501557, + "tokens_seen": 696293376 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039849548645937816, + "loss": 3.0929, + "theoretical_loss": 3.7814767719266777, + "tokens_seen": 696326144 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039848545636910734, + "loss": 3.1673, + "theoretical_loss": 3.7814406029477485, + "tokens_seen": 696391680 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003984754262788365, + "loss": 2.9897, + "theoretical_loss": 3.7814044383254055, + "tokens_seen": 696457216 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003984653961885657, + "loss": 2.8212, + "theoretical_loss": 3.7813682780587157, + "tokens_seen": 696522752 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003984553660982949, + "loss": 2.996, + "theoretical_loss": 3.781332122146744, + "tokens_seen": 696588288 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039844533600802407, + "loss": 3.0566, + "theoretical_loss": 3.7812959705885563, + "tokens_seen": 696653824 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003984353059177533, + "loss": 2.9729, + "theoretical_loss": 3.7812598233832198, + "tokens_seen": 696719360 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039842527582748243, + "loss": 3.0762, + "theoretical_loss": 3.7812236805297994, + "tokens_seen": 696784896 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039841524573721167, + "loss": 2.9394, + "theoretical_loss": 3.7811875420273635, + "tokens_seen": 696850432 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039840521564694085, + "loss": 2.8887, + "theoretical_loss": 3.781151407874978, + "tokens_seen": 696915968 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039839518555667003, + "loss": 2.7821, + "theoretical_loss": 3.781115278071711, + "tokens_seen": 696981504 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003983851554663992, + "loss": 3.102, + "theoretical_loss": 3.7810791526166305, + "tokens_seen": 697047040 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003983751253761284, + "loss": 3.0743, + "theoretical_loss": 3.7810430315088035, + "tokens_seen": 697112576 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039836509528585757, + "loss": 2.8172, + "theoretical_loss": 3.781006914747299, + "tokens_seen": 697178112 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003983550651955868, + "loss": 3.0305, + "theoretical_loss": 3.7809708023311845, + "tokens_seen": 697243648 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039834503510531593, + "loss": 3.1128, + "theoretical_loss": 3.78093469425953, + "tokens_seen": 697309184 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039833500501504517, + "loss": 2.7125, + "theoretical_loss": 3.780898590531404, + "tokens_seen": 697374720 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003983249749247743, + "loss": 3.1554, + "theoretical_loss": 3.780862491145876, + "tokens_seen": 697440256 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039831494483450353, + "loss": 2.9723, + "theoretical_loss": 3.7808263961020154, + "tokens_seen": 697505792 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003983049147442327, + "loss": 3.1173, + "theoretical_loss": 3.7807903053988934, + "tokens_seen": 697571328 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003982948846539619, + "loss": 3.0704, + "theoretical_loss": 3.7807542190355785, + "tokens_seen": 697636864 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003982848545636911, + "loss": 3.0337, + "theoretical_loss": 3.7807181370111422, + "tokens_seen": 697702400 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039827482447342026, + "loss": 2.9521, + "theoretical_loss": 3.780682059324656, + "tokens_seen": 697767936 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039826479438314944, + "loss": 3.0753, + "theoretical_loss": 3.7806459859751893, + "tokens_seen": 697833472 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039825476429287867, + "loss": 2.9736, + "theoretical_loss": 3.7806099169618146, + "tokens_seen": 697899008 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1132035, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0500290393829346, + "objective/train/theoretical_loss": 3.780591884080872, + "objective/train/tokens_used": 718391776, + "theoretical_loss": 3.780591884080872, + "tokens_seen": 697931776 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003982447342026078, + "loss": 3.0029, + "theoretical_loss": 3.7805738522836037, + "tokens_seen": 697964544 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039823470411233703, + "loss": 3.0065, + "theoretical_loss": 3.7805377919396284, + "tokens_seen": 698030080 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003982246740220662, + "loss": 3.0419, + "theoretical_loss": 3.78050173592896, + "tokens_seen": 698095616 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003982146439317954, + "loss": 3.0344, + "theoretical_loss": 3.7804656842506725, + "tokens_seen": 698161152 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003982046138415246, + "loss": 3.0026, + "theoretical_loss": 3.7804296369038384, + "tokens_seen": 698226688 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039819458375125376, + "loss": 3.1014, + "theoretical_loss": 3.7803935938875304, + "tokens_seen": 698292224 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039818455366098294, + "loss": 3.0101, + "theoretical_loss": 3.7803575552008217, + "tokens_seen": 698357760 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003981745235707122, + "loss": 3.0664, + "theoretical_loss": 3.7803215208427865, + "tokens_seen": 698423296 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003981644934804413, + "loss": 2.9467, + "theoretical_loss": 3.7802854908124983, + "tokens_seen": 698488832 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039815446339017054, + "loss": 2.954, + "theoretical_loss": 3.7802494651090317, + "tokens_seen": 698554368 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039814443329989966, + "loss": 3.026, + "theoretical_loss": 3.7802134437314616, + "tokens_seen": 698619904 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003981344032096289, + "loss": 2.9283, + "theoretical_loss": 3.780177426678862, + "tokens_seen": 698685440 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003981243731193581, + "loss": 2.9498, + "theoretical_loss": 3.780141413950309, + "tokens_seen": 698750976 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039811434302908726, + "loss": 2.8749, + "theoretical_loss": 3.7801054055448766, + "tokens_seen": 698816512 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039810431293881644, + "loss": 2.9659, + "theoretical_loss": 3.7800694014616423, + "tokens_seen": 698882048 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003980942828485457, + "loss": 3.0811, + "theoretical_loss": 3.78003340169968, + "tokens_seen": 698947584 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003980842527582748, + "loss": 2.8309, + "theoretical_loss": 3.779997406258068, + "tokens_seen": 699013120 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039807422266800404, + "loss": 3.0565, + "theoretical_loss": 3.7799614151358814, + "tokens_seen": 699078656 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039806419257773317, + "loss": 2.8683, + "theoretical_loss": 3.779925428332197, + "tokens_seen": 699144192 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003980541624874624, + "loss": 3.1145, + "theoretical_loss": 3.7798894458460928, + "tokens_seen": 699209728 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003980441323971916, + "loss": 3.0463, + "theoretical_loss": 3.7798534676766455, + "tokens_seen": 699275264 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039803410230692077, + "loss": 2.924, + "theoretical_loss": 3.779817493822933, + "tokens_seen": 699340800 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039802407221664995, + "loss": 3.1726, + "theoretical_loss": 3.779781524284034, + "tokens_seen": 699406336 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039801404212637913, + "loss": 2.9368, + "theoretical_loss": 3.779745559059025, + "tokens_seen": 699471872 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003980040120361083, + "loss": 3.1256, + "theoretical_loss": 3.7797095981469857, + "tokens_seen": 699537408 + }, + { + "epoch": 2.0, + "objective/train/docs_used": 1134916, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1210145950317383, + "objective/train/theoretical_loss": 3.779691619308042, + "objective/train/tokens_used": 720030176, + "theoretical_loss": 3.779691619308042, + "tokens_seen": 699570176 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039799398194583754, + "loss": 3.0097, + "theoretical_loss": 3.779673641546995, + "tokens_seen": 699602944 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039798395185556667, + "loss": 3.0371, + "theoretical_loss": 3.7796376892581316, + "tokens_seen": 699668480 + }, + { + "epoch": 2.0, + "learning_rate": 0.0003979739217652959, + "loss": 3.0254, + "theoretical_loss": 3.7796017412794747, + "tokens_seen": 699734016 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039796389167502503, + "loss": 2.9508, + "theoretical_loss": 3.7795657976101045, + "tokens_seen": 699799552 + }, + { + "epoch": 2.0, + "learning_rate": 0.00039795386158475427, + "loss": 3.0203, + "theoretical_loss": 3.7795298582491004, + "tokens_seen": 699865088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039794383149448345, + "loss": 3.0236, + "theoretical_loss": 3.7794939231955427, + "tokens_seen": 699930624 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039793380140421263, + "loss": 2.7663, + "theoretical_loss": 3.7794579924485125, + "tokens_seen": 699996160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003979237713139418, + "loss": 2.8867, + "theoretical_loss": 3.77942206600709, + "tokens_seen": 700061696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039791374122367105, + "loss": 3.0038, + "theoretical_loss": 3.779386143870356, + "tokens_seen": 700127232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003979037111334002, + "loss": 2.9642, + "theoretical_loss": 3.7793502260373923, + "tokens_seen": 700192768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003978936810431294, + "loss": 3.035, + "theoretical_loss": 3.779314312507281, + "tokens_seen": 700258304 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003978836509528586, + "loss": 3.0223, + "theoretical_loss": 3.7792784032791027, + "tokens_seen": 700323840 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039787362086258777, + "loss": 3.0125, + "theoretical_loss": 3.779242498351941, + "tokens_seen": 700389376 + }, + { + "epoch": 2.01, + "learning_rate": 0.000397863590772317, + "loss": 2.9328, + "theoretical_loss": 3.779206597724877, + "tokens_seen": 700454912 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039785356068204613, + "loss": 3.072, + "theoretical_loss": 3.779170701396995, + "tokens_seen": 700520448 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039784353059177537, + "loss": 3.15, + "theoretical_loss": 3.7791348093673767, + "tokens_seen": 700585984 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003978335005015045, + "loss": 2.9555, + "theoretical_loss": 3.7790989216351054, + "tokens_seen": 700651520 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039782347041123373, + "loss": 2.9995, + "theoretical_loss": 3.779063038199266, + "tokens_seen": 700717056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003978134403209629, + "loss": 2.9782, + "theoretical_loss": 3.7790271590589413, + "tokens_seen": 700782592 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003978034102306921, + "loss": 2.7636, + "theoretical_loss": 3.7789912842132156, + "tokens_seen": 700848128 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003977933801404213, + "loss": 2.9519, + "theoretical_loss": 3.7789554136611736, + "tokens_seen": 700913664 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039778335005015046, + "loss": 3.0843, + "theoretical_loss": 3.7789195474019, + "tokens_seen": 700979200 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039777331995987964, + "loss": 3.1213, + "theoretical_loss": 3.7788836854344794, + "tokens_seen": 701044736 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039776328986960887, + "loss": 3.0071, + "theoretical_loss": 3.7788478277579975, + "tokens_seen": 701110272 + }, + { + "epoch": 2.01, + "learning_rate": 0.000397753259779338, + "loss": 3.0154, + "theoretical_loss": 3.77881197437154, + "tokens_seen": 701175808 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1137822, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.112853765487671, + "objective/train/theoretical_loss": 3.778794049286784, + "objective/train/tokens_used": 721668576, + "theoretical_loss": 3.778794049286784, + "tokens_seen": 701208576 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039774322968906723, + "loss": 3.012, + "theoretical_loss": 3.778776125274192, + "tokens_seen": 701241344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003977331995987964, + "loss": 3.0283, + "theoretical_loss": 3.7787402804650405, + "tokens_seen": 701306880 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003977231695085256, + "loss": 3.0277, + "theoretical_loss": 3.778704439943171, + "tokens_seen": 701372416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003977131394182548, + "loss": 3.0614, + "theoretical_loss": 3.778668603707671, + "tokens_seen": 701437952 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039770310932798396, + "loss": 2.792, + "theoretical_loss": 3.7786327717576276, + "tokens_seen": 701503488 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039769307923771314, + "loss": 3.0792, + "theoretical_loss": 3.7785969440921265, + "tokens_seen": 701569024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003976830491474424, + "loss": 3.0474, + "theoretical_loss": 3.778561120710257, + "tokens_seen": 701634560 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003976730190571715, + "loss": 2.8767, + "theoretical_loss": 3.778525301611106, + "tokens_seen": 701700096 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039766298896690074, + "loss": 2.955, + "theoretical_loss": 3.778489486793762, + "tokens_seen": 701765632 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039765295887662987, + "loss": 3.058, + "theoretical_loss": 3.778453676257313, + "tokens_seen": 701831168 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003976429287863591, + "loss": 2.9982, + "theoretical_loss": 3.778417870000848, + "tokens_seen": 701896704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003976328986960883, + "loss": 2.957, + "theoretical_loss": 3.7783820680234554, + "tokens_seen": 701962240 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039762286860581746, + "loss": 2.9948, + "theoretical_loss": 3.778346270324225, + "tokens_seen": 702027776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039761283851554664, + "loss": 3.013, + "theoretical_loss": 3.7783104769022455, + "tokens_seen": 702093312 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003976028084252759, + "loss": 3.0831, + "theoretical_loss": 3.7782746877566074, + "tokens_seen": 702158848 + }, + { + "epoch": 2.01, + "learning_rate": 0.000397592778335005, + "loss": 2.8625, + "theoretical_loss": 3.7782389028864003, + "tokens_seen": 702224384 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039758274824473424, + "loss": 3.0658, + "theoretical_loss": 3.778203122290715, + "tokens_seen": 702289920 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039757271815446337, + "loss": 2.9709, + "theoretical_loss": 3.7781673459686416, + "tokens_seen": 702355456 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003975626880641926, + "loss": 3.0138, + "theoretical_loss": 3.7781315739192713, + "tokens_seen": 702420992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003975526579739218, + "loss": 2.938, + "theoretical_loss": 3.7780958061416947, + "tokens_seen": 702486528 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039754262788365097, + "loss": 2.9755, + "theoretical_loss": 3.778060042635004, + "tokens_seen": 702552064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039753259779338015, + "loss": 3.0604, + "theoretical_loss": 3.77802428339829, + "tokens_seen": 702617600 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039752256770310933, + "loss": 2.8289, + "theoretical_loss": 3.777988528430645, + "tokens_seen": 702683136 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003975125376128385, + "loss": 2.9962, + "theoretical_loss": 3.777952777731162, + "tokens_seen": 702748672 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039750250752256774, + "loss": 2.8595, + "theoretical_loss": 3.7779170312989327, + "tokens_seen": 702814208 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1140622, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2860770225524902, + "objective/train/theoretical_loss": 3.7778991596827547, + "objective/train/tokens_used": 723306976, + "theoretical_loss": 3.7778991596827547, + "tokens_seen": 702846976 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039749247743229687, + "loss": 3.1478, + "theoretical_loss": 3.77788128913305, + "tokens_seen": 702879744 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003974824473420261, + "loss": 2.8618, + "theoretical_loss": 3.777845551232608, + "tokens_seen": 702945280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039747241725175523, + "loss": 2.9763, + "theoretical_loss": 3.7778098175966983, + "tokens_seen": 703010816 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039746238716148447, + "loss": 3.1218, + "theoretical_loss": 3.777774088224416, + "tokens_seen": 703076352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039745235707121365, + "loss": 2.9473, + "theoretical_loss": 3.7777383631148544, + "tokens_seen": 703141888 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039744232698094283, + "loss": 3.0865, + "theoretical_loss": 3.777702642267108, + "tokens_seen": 703207424 + }, + { + "epoch": 2.01, + "learning_rate": 0.000397432296890672, + "loss": 2.995, + "theoretical_loss": 3.777666925680271, + "tokens_seen": 703272960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039742226680040125, + "loss": 2.993, + "theoretical_loss": 3.7776312133534375, + "tokens_seen": 703338496 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003974122367101304, + "loss": 2.9532, + "theoretical_loss": 3.7775955052857038, + "tokens_seen": 703404032 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003974022066198596, + "loss": 3.0307, + "theoretical_loss": 3.777559801476165, + "tokens_seen": 703469568 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039739217652958874, + "loss": 3.0738, + "theoretical_loss": 3.777524101923916, + "tokens_seen": 703535104 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039738214643931797, + "loss": 2.9955, + "theoretical_loss": 3.777488406628053, + "tokens_seen": 703600640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039737211634904715, + "loss": 2.9607, + "theoretical_loss": 3.7774527155876725, + "tokens_seen": 703666176 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039736208625877633, + "loss": 3.1088, + "theoretical_loss": 3.77741702880187, + "tokens_seen": 703731712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003973520561685055, + "loss": 2.9482, + "theoretical_loss": 3.777381346269743, + "tokens_seen": 703797248 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003973420260782347, + "loss": 2.9526, + "theoretical_loss": 3.7773456679903887, + "tokens_seen": 703862784 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003973319959879639, + "loss": 3.052, + "theoretical_loss": 3.777309993962903, + "tokens_seen": 703928320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003973219658976931, + "loss": 3.1575, + "theoretical_loss": 3.7772743241863846, + "tokens_seen": 703993856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039731193580742224, + "loss": 2.942, + "theoretical_loss": 3.777238658659931, + "tokens_seen": 704059392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003973019057171515, + "loss": 3.0734, + "theoretical_loss": 3.7772029973826404, + "tokens_seen": 704124928 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003972918756268806, + "loss": 3.1307, + "theoretical_loss": 3.7771673403536106, + "tokens_seen": 704190464 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039728184553660984, + "loss": 2.8693, + "theoretical_loss": 3.777131687571941, + "tokens_seen": 704256000 + }, + { + "epoch": 2.01, + "learning_rate": 0.000397271815446339, + "loss": 3.044, + "theoretical_loss": 3.7770960390367296, + "tokens_seen": 704321536 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003972617853560682, + "loss": 2.9128, + "theoretical_loss": 3.7770603947470764, + "tokens_seen": 704387072 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003972517552657974, + "loss": 2.9582, + "theoretical_loss": 3.7770247547020803, + "tokens_seen": 704452608 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1142051, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9800336360931396, + "objective/train/theoretical_loss": 3.7770069362710474, + "objective/train/tokens_used": 724945376, + "theoretical_loss": 3.7770069362710474, + "tokens_seen": 704485376 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003972417251755266, + "loss": 3.005, + "theoretical_loss": 3.7769891189008407, + "tokens_seen": 704518144 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039723169508525574, + "loss": 3.0725, + "theoretical_loss": 3.7769534873424586, + "tokens_seen": 704583680 + }, + { + "epoch": 2.01, + "learning_rate": 0.000397221664994985, + "loss": 3.0813, + "theoretical_loss": 3.7769178600260336, + "tokens_seen": 704649216 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003972116349047141, + "loss": 3.0256, + "theoretical_loss": 3.7768822369506667, + "tokens_seen": 704714752 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039720160481444334, + "loss": 3.0458, + "theoretical_loss": 3.7768466181154583, + "tokens_seen": 704780288 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003971915747241725, + "loss": 2.9379, + "theoretical_loss": 3.776811003519509, + "tokens_seen": 704845824 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003971815446339017, + "loss": 3.0863, + "theoretical_loss": 3.7767753931619215, + "tokens_seen": 704911360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003971715145436309, + "loss": 3.0192, + "theoretical_loss": 3.7767397870417962, + "tokens_seen": 704976896 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039716148445336007, + "loss": 2.927, + "theoretical_loss": 3.7767041851582355, + "tokens_seen": 705042432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039715145436308925, + "loss": 3.115, + "theoretical_loss": 3.7766685875103416, + "tokens_seen": 705107968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003971414242728185, + "loss": 3.0825, + "theoretical_loss": 3.776632994097217, + "tokens_seen": 705173504 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039713139418254766, + "loss": 3.1039, + "theoretical_loss": 3.7765974049179647, + "tokens_seen": 705239040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039712136409227684, + "loss": 2.9621, + "theoretical_loss": 3.7765618199716866, + "tokens_seen": 705304576 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003971113340020061, + "loss": 3.1001, + "theoretical_loss": 3.776526239257487, + "tokens_seen": 705370112 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003971013039117352, + "loss": 2.9694, + "theoretical_loss": 3.7764906627744694, + "tokens_seen": 705435648 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039709127382146444, + "loss": 3.2438, + "theoretical_loss": 3.7764550905217376, + "tokens_seen": 705501184 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039708124373119357, + "loss": 2.9646, + "theoretical_loss": 3.776419522498395, + "tokens_seen": 705566720 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003970712136409228, + "loss": 2.7774, + "theoretical_loss": 3.776383958703547, + "tokens_seen": 705632256 + }, + { + "epoch": 2.01, + "learning_rate": 0.000397061183550652, + "loss": 3.0742, + "theoretical_loss": 3.776348399136297, + "tokens_seen": 705697792 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039705115346038117, + "loss": 3.1148, + "theoretical_loss": 3.7763128437957514, + "tokens_seen": 705763328 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039704112337011035, + "loss": 2.9025, + "theoretical_loss": 3.776277292681014, + "tokens_seen": 705828864 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039703109327983953, + "loss": 3.0582, + "theoretical_loss": 3.776241745791191, + "tokens_seen": 705894400 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003970210631895687, + "loss": 2.9788, + "theoretical_loss": 3.7762062031253887, + "tokens_seen": 705959936 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039701103309929794, + "loss": 3.1541, + "theoretical_loss": 3.776170664682712, + "tokens_seen": 706025472 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039700100300902707, + "loss": 2.9948, + "theoretical_loss": 3.7761351304622677, + "tokens_seen": 706091008 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1144616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.277618169784546, + "objective/train/theoretical_loss": 3.776117364935103, + "objective/train/tokens_used": 726583776, + "theoretical_loss": 3.776117364935103, + "tokens_seen": 706123776 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003969909729187563, + "loss": 3.1073, + "theoretical_loss": 3.776099600463162, + "tokens_seen": 706156544 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039698094282848543, + "loss": 2.9309, + "theoretical_loss": 3.776064074684502, + "tokens_seen": 706222080 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039697091273821467, + "loss": 2.9937, + "theoretical_loss": 3.776028553125395, + "tokens_seen": 706287616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039696088264794385, + "loss": 3.0137, + "theoretical_loss": 3.775993035784948, + "tokens_seen": 706353152 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039695085255767303, + "loss": 2.9947, + "theoretical_loss": 3.775957522662269, + "tokens_seen": 706418688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003969408224674022, + "loss": 3.0318, + "theoretical_loss": 3.7759220137564653, + "tokens_seen": 706484224 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039693079237713145, + "loss": 3.1186, + "theoretical_loss": 3.7758865090666456, + "tokens_seen": 706549760 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003969207622868606, + "loss": 3.15, + "theoretical_loss": 3.775851008591918, + "tokens_seen": 706615296 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003969107321965898, + "loss": 3.0676, + "theoretical_loss": 3.7758155123313917, + "tokens_seen": 706680832 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039690070210631894, + "loss": 2.7829, + "theoretical_loss": 3.7757800202841754, + "tokens_seen": 706746368 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039689067201604817, + "loss": 3.1311, + "theoretical_loss": 3.7757445324493784, + "tokens_seen": 706811904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039688064192577735, + "loss": 2.9669, + "theoretical_loss": 3.7757090488261102, + "tokens_seen": 706877440 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039687061183550653, + "loss": 3.0485, + "theoretical_loss": 3.775673569413481, + "tokens_seen": 706942976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003968605817452357, + "loss": 3.0962, + "theoretical_loss": 3.7756380942106, + "tokens_seen": 707008512 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003968505516549649, + "loss": 3.1243, + "theoretical_loss": 3.775602623216578, + "tokens_seen": 707074048 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003968405215646941, + "loss": 3.0844, + "theoretical_loss": 3.775567156430526, + "tokens_seen": 707139584 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003968304914744233, + "loss": 2.9422, + "theoretical_loss": 3.775531693851554, + "tokens_seen": 707205120 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039682046138415244, + "loss": 3.108, + "theoretical_loss": 3.7754962354787747, + "tokens_seen": 707270656 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003968104312938817, + "loss": 3.1274, + "theoretical_loss": 3.7754607813112977, + "tokens_seen": 707336192 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003968004012036108, + "loss": 2.9938, + "theoretical_loss": 3.7754253313482358, + "tokens_seen": 707401728 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039679037111334004, + "loss": 2.9876, + "theoretical_loss": 3.7753898855887007, + "tokens_seen": 707467264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003967803410230692, + "loss": 3.0968, + "theoretical_loss": 3.7753544440318048, + "tokens_seen": 707532800 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003967703109327984, + "loss": 2.9856, + "theoretical_loss": 3.77531900667666, + "tokens_seen": 707598336 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003967602808425276, + "loss": 3.1437, + "theoretical_loss": 3.77528357352238, + "tokens_seen": 707663872 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003967502507522568, + "loss": 2.9961, + "theoretical_loss": 3.7752481445680774, + "tokens_seen": 707729408 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1147397, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.929410219192505, + "objective/train/theoretical_loss": 3.7752304316656407, + "objective/train/tokens_used": 728222176, + "theoretical_loss": 3.7752304316656407, + "tokens_seen": 707762176 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039674022066198594, + "loss": 3.021, + "theoretical_loss": 3.7752127198128655, + "tokens_seen": 707794944 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003967301905717152, + "loss": 3.1974, + "theoretical_loss": 3.7751772992558577, + "tokens_seen": 707860480 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003967201604814443, + "loss": 3.1144, + "theoretical_loss": 3.7751418828961683, + "tokens_seen": 707926016 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039671013039117354, + "loss": 2.774, + "theoretical_loss": 3.775106470732911, + "tokens_seen": 707991552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003967001003009027, + "loss": 3.1101, + "theoretical_loss": 3.7750710627652, + "tokens_seen": 708057088 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003966900702106319, + "loss": 2.9276, + "theoretical_loss": 3.7750356589921505, + "tokens_seen": 708122624 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003966800401203611, + "loss": 3.0518, + "theoretical_loss": 3.775000259412878, + "tokens_seen": 708188160 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039667001003009027, + "loss": 2.8906, + "theoretical_loss": 3.7749648640264963, + "tokens_seen": 708253696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039665997993981945, + "loss": 2.8831, + "theoretical_loss": 3.774929472832121, + "tokens_seen": 708319232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003966499498495487, + "loss": 2.9714, + "theoretical_loss": 3.7748940858288695, + "tokens_seen": 708384768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003966399197592778, + "loss": 3.0363, + "theoretical_loss": 3.774858703015856, + "tokens_seen": 708450304 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039662988966900704, + "loss": 2.9379, + "theoretical_loss": 3.7748233243921976, + "tokens_seen": 708515840 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039661985957873617, + "loss": 3.1245, + "theoretical_loss": 3.7747879499570107, + "tokens_seen": 708581376 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003966098294884654, + "loss": 2.8607, + "theoretical_loss": 3.7747525797094115, + "tokens_seen": 708646912 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003965997993981946, + "loss": 3.0793, + "theoretical_loss": 3.7747172136485183, + "tokens_seen": 708712448 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039658976930792377, + "loss": 3.1304, + "theoretical_loss": 3.7746818517734475, + "tokens_seen": 708777984 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039657973921765295, + "loss": 2.8869, + "theoretical_loss": 3.774646494083317, + "tokens_seen": 708843520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003965697091273822, + "loss": 2.9442, + "theoretical_loss": 3.774611140577245, + "tokens_seen": 708909056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003965596790371113, + "loss": 3.0579, + "theoretical_loss": 3.774575791254349, + "tokens_seen": 708974592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039654964894684055, + "loss": 2.9822, + "theoretical_loss": 3.7745404461137477, + "tokens_seen": 709040128 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003965396188565697, + "loss": 2.9757, + "theoretical_loss": 3.77450510515456, + "tokens_seen": 709105664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003965295887662989, + "loss": 2.991, + "theoretical_loss": 3.774469768375905, + "tokens_seen": 709171200 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003965195586760281, + "loss": 2.9781, + "theoretical_loss": 3.774434435776901, + "tokens_seen": 709236736 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039650952858575727, + "loss": 2.9795, + "theoretical_loss": 3.7743991073566687, + "tokens_seen": 709302272 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039649949849548645, + "loss": 3.0186, + "theoretical_loss": 3.774363783114327, + "tokens_seen": 709367808 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1150117, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2173540592193604, + "objective/train/theoretical_loss": 3.77434612255959, + "objective/train/tokens_used": 729860576, + "theoretical_loss": 3.77434612255959, + "tokens_seen": 709400576 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039648946840521563, + "loss": 2.9652, + "theoretical_loss": 3.774328463048996, + "tokens_seen": 709433344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003964794383149448, + "loss": 2.9781, + "theoretical_loss": 3.774293147159796, + "tokens_seen": 709498880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039646940822467405, + "loss": 3.0392, + "theoretical_loss": 3.774257835445848, + "tokens_seen": 709564416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003964593781344032, + "loss": 2.8995, + "theoretical_loss": 3.7742225279062724, + "tokens_seen": 709629952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003964493480441324, + "loss": 2.9164, + "theoretical_loss": 3.7741872245401904, + "tokens_seen": 709695488 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003964393179538616, + "loss": 2.8631, + "theoretical_loss": 3.7741519253467226, + "tokens_seen": 709761024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003964292878635908, + "loss": 2.8852, + "theoretical_loss": 3.7741166303249924, + "tokens_seen": 709826560 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039641925777331996, + "loss": 3.035, + "theoretical_loss": 3.77408133947412, + "tokens_seen": 709892096 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039640922768304914, + "loss": 2.9518, + "theoretical_loss": 3.7740460527932282, + "tokens_seen": 709957632 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003963991975927783, + "loss": 3.0133, + "theoretical_loss": 3.77401077028144, + "tokens_seen": 710023168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039638916750250755, + "loss": 3.103, + "theoretical_loss": 3.7739754919378767, + "tokens_seen": 710088704 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039637913741223673, + "loss": 3.0586, + "theoretical_loss": 3.773940217761662, + "tokens_seen": 710154240 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003963691073219659, + "loss": 2.9759, + "theoretical_loss": 3.7739049477519195, + "tokens_seen": 710219776 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003963590772316951, + "loss": 3.1464, + "theoretical_loss": 3.7738696819077724, + "tokens_seen": 710285312 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003963490471414243, + "loss": 2.8777, + "theoretical_loss": 3.7738344202283436, + "tokens_seen": 710350848 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003963390170511535, + "loss": 2.8875, + "theoretical_loss": 3.7737991627127587, + "tokens_seen": 710416384 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039632898696088264, + "loss": 2.9757, + "theoretical_loss": 3.7737639093601407, + "tokens_seen": 710481920 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003963189568706119, + "loss": 2.7921, + "theoretical_loss": 3.7737286601696143, + "tokens_seen": 710547456 + }, + { + "epoch": 2.01, + "learning_rate": 0.000396308926780341, + "loss": 2.9891, + "theoretical_loss": 3.773693415140305, + "tokens_seen": 710612992 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039629889669007024, + "loss": 3.0746, + "theoretical_loss": 3.7736581742713367, + "tokens_seen": 710678528 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003962888665997994, + "loss": 3.0382, + "theoretical_loss": 3.7736229375618358, + "tokens_seen": 710744064 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003962788365095286, + "loss": 2.9244, + "theoretical_loss": 3.773587705010928, + "tokens_seen": 710809600 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003962688064192578, + "loss": 3.0097, + "theoretical_loss": 3.7735524766177377, + "tokens_seen": 710875136 + }, + { + "epoch": 2.01, + "learning_rate": 0.000396258776328987, + "loss": 2.9521, + "theoretical_loss": 3.7735172523813927, + "tokens_seen": 710940672 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039624874623871614, + "loss": 3.0219, + "theoretical_loss": 3.7734820323010183, + "tokens_seen": 711006208 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1152743, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.030127763748169, + "objective/train/theoretical_loss": 3.773464423819047, + "objective/train/tokens_used": 731498976, + "theoretical_loss": 3.773464423819047, + "tokens_seen": 711038976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003962387161484454, + "loss": 2.9408, + "theoretical_loss": 3.773446816375742, + "tokens_seen": 711071744 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003962286860581745, + "loss": 2.9331, + "theoretical_loss": 3.7734116046046897, + "tokens_seen": 711137280 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039621865596790374, + "loss": 3.0302, + "theoretical_loss": 3.7733763969869893, + "tokens_seen": 711202816 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003962086258776329, + "loss": 3.0211, + "theoretical_loss": 3.7733411935217678, + "tokens_seen": 711268352 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003961985957873621, + "loss": 3.0872, + "theoretical_loss": 3.7733059942081537, + "tokens_seen": 711333888 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003961885656970913, + "loss": 2.9653, + "theoretical_loss": 3.7732707990452745, + "tokens_seen": 711399424 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039617853560682047, + "loss": 2.9092, + "theoretical_loss": 3.773235608032258, + "tokens_seen": 711464960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039616850551654965, + "loss": 3.0604, + "theoretical_loss": 3.773200421168233, + "tokens_seen": 711530496 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003961584754262789, + "loss": 3.0327, + "theoretical_loss": 3.773165238452328, + "tokens_seen": 711596032 + }, + { + "epoch": 2.01, + "learning_rate": 0.000396148445336008, + "loss": 3.0547, + "theoretical_loss": 3.7731300598836732, + "tokens_seen": 711661568 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039613841524573724, + "loss": 2.9622, + "theoretical_loss": 3.773094885461396, + "tokens_seen": 711727104 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039612838515546637, + "loss": 2.866, + "theoretical_loss": 3.773059715184628, + "tokens_seen": 711792640 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003961183550651956, + "loss": 2.977, + "theoretical_loss": 3.7730245490524976, + "tokens_seen": 711858176 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003961083249749248, + "loss": 3.1074, + "theoretical_loss": 3.7729893870641353, + "tokens_seen": 711923712 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039609829488465397, + "loss": 3.0645, + "theoretical_loss": 3.7729542292186715, + "tokens_seen": 711989248 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039608826479438315, + "loss": 2.9392, + "theoretical_loss": 3.7729190755152366, + "tokens_seen": 712054784 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003960782347041124, + "loss": 2.9775, + "theoretical_loss": 3.772883925952961, + "tokens_seen": 712120320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003960682046138415, + "loss": 2.9557, + "theoretical_loss": 3.7728487805309774, + "tokens_seen": 712185856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039605817452357075, + "loss": 3.0365, + "theoretical_loss": 3.7728136392484153, + "tokens_seen": 712251392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003960481444332999, + "loss": 3.0041, + "theoretical_loss": 3.7727785021044076, + "tokens_seen": 712316928 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003960381143430291, + "loss": 2.9861, + "theoretical_loss": 3.772743369098086, + "tokens_seen": 712382464 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003960280842527583, + "loss": 2.8913, + "theoretical_loss": 3.772708240228582, + "tokens_seen": 712448000 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039601805416248747, + "loss": 3.0544, + "theoretical_loss": 3.772673115495029, + "tokens_seen": 712513536 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039600802407221665, + "loss": 2.9655, + "theoretical_loss": 3.7726379948965594, + "tokens_seen": 712579072 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039599799398194583, + "loss": 3.0678, + "theoretical_loss": 3.7726028784323056, + "tokens_seen": 712644608 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1155520, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1862857341766357, + "objective/train/theoretical_loss": 3.7725853217502383, + "objective/train/tokens_used": 733137376, + "theoretical_loss": 3.7725853217502383, + "tokens_seen": 712677376 + }, + { + "epoch": 2.01, + "learning_rate": 0.000395987963891675, + "loss": 3.0185, + "theoretical_loss": 3.7725677661014005, + "tokens_seen": 712710144 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039597793380140425, + "loss": 3.0946, + "theoretical_loss": 3.772532657902979, + "tokens_seen": 712775680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003959679037111334, + "loss": 2.7653, + "theoretical_loss": 3.7724975538361747, + "tokens_seen": 712841216 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003959578736208626, + "loss": 3.1089, + "theoretical_loss": 3.77246245390012, + "tokens_seen": 712906752 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003959478435305918, + "loss": 3.0752, + "theoretical_loss": 3.7724273580939514, + "tokens_seen": 712972288 + }, + { + "epoch": 2.01, + "learning_rate": 0.000395937813440321, + "loss": 3.0065, + "theoretical_loss": 3.772392266416801, + "tokens_seen": 713037824 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039592778335005016, + "loss": 2.9624, + "theoretical_loss": 3.7723571788678054, + "tokens_seen": 713103360 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039591775325977934, + "loss": 2.8983, + "theoretical_loss": 3.772322095446099, + "tokens_seen": 713168896 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003959077231695085, + "loss": 3.0269, + "theoretical_loss": 3.772287016150817, + "tokens_seen": 713234432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039589769307923775, + "loss": 2.9475, + "theoretical_loss": 3.7722519409810955, + "tokens_seen": 713299968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003958876629889669, + "loss": 2.9031, + "theoretical_loss": 3.77221686993607, + "tokens_seen": 713365504 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003958776328986961, + "loss": 2.9134, + "theoretical_loss": 3.7721818030148757, + "tokens_seen": 713431040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039586760280842524, + "loss": 2.9965, + "theoretical_loss": 3.772146740216651, + "tokens_seen": 713496576 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003958575727181545, + "loss": 2.8568, + "theoretical_loss": 3.7721116815405304, + "tokens_seen": 713562112 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039584754262788366, + "loss": 2.9385, + "theoretical_loss": 3.772076626985652, + "tokens_seen": 713627648 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039583751253761284, + "loss": 3.0919, + "theoretical_loss": 3.7720415765511524, + "tokens_seen": 713693184 + }, + { + "epoch": 2.01, + "learning_rate": 0.000395827482447342, + "loss": 3.0487, + "theoretical_loss": 3.7720065302361694, + "tokens_seen": 713758720 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003958174523570712, + "loss": 3.0481, + "theoretical_loss": 3.7719714880398403, + "tokens_seen": 713824256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003958074222668004, + "loss": 2.9958, + "theoretical_loss": 3.771936449961303, + "tokens_seen": 713889792 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003957973921765296, + "loss": 2.9323, + "theoretical_loss": 3.7719014159996966, + "tokens_seen": 713955328 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039578736208625875, + "loss": 2.8443, + "theoretical_loss": 3.7718663861541577, + "tokens_seen": 714020864 + }, + { + "epoch": 2.01, + "learning_rate": 0.000395777331995988, + "loss": 2.8776, + "theoretical_loss": 3.7718313604238265, + "tokens_seen": 714086400 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039576730190571716, + "loss": 3.0228, + "theoretical_loss": 3.771796338807842, + "tokens_seen": 714151936 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039575727181544634, + "loss": 2.9201, + "theoretical_loss": 3.771761321305342, + "tokens_seen": 714217472 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003957472417251755, + "loss": 2.929, + "theoretical_loss": 3.7717263079154675, + "tokens_seen": 714283008 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1157062, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.81109356880188, + "objective/train/theoretical_loss": 3.7717088027624954, + "objective/train/tokens_used": 734775776, + "theoretical_loss": 3.7717088027624954, + "tokens_seen": 714315776 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003957372116349047, + "loss": 2.9178, + "theoretical_loss": 3.771691298637357, + "tokens_seen": 714348544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003957271815446339, + "loss": 2.9156, + "theoretical_loss": 3.7716562934701514, + "tokens_seen": 714414080 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003957171514543631, + "loss": 2.9702, + "theoretical_loss": 3.7716212924129904, + "tokens_seen": 714479616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039570712136409225, + "loss": 2.9924, + "theoretical_loss": 3.7715862954650152, + "tokens_seen": 714545152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003956970912738215, + "loss": 3.0219, + "theoretical_loss": 3.771551302625366, + "tokens_seen": 714610688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003956870611835506, + "loss": 2.9811, + "theoretical_loss": 3.7715163138931835, + "tokens_seen": 714676224 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039567703109327985, + "loss": 3.1006, + "theoretical_loss": 3.7714813292676093, + "tokens_seen": 714741760 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039566700100300903, + "loss": 3.0094, + "theoretical_loss": 3.7714463487477854, + "tokens_seen": 714807296 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003956569709127382, + "loss": 2.9318, + "theoretical_loss": 3.7714113723328526, + "tokens_seen": 714872832 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003956469408224674, + "loss": 2.9376, + "theoretical_loss": 3.771376400021954, + "tokens_seen": 714938368 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039563691073219657, + "loss": 2.8744, + "theoretical_loss": 3.771341431814231, + "tokens_seen": 715003904 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003956268806419258, + "loss": 2.961, + "theoretical_loss": 3.7713064677088273, + "tokens_seen": 715069440 + }, + { + "epoch": 2.01, + "learning_rate": 0.000395616850551655, + "loss": 2.9553, + "theoretical_loss": 3.771271507704884, + "tokens_seen": 715134976 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039560682046138417, + "loss": 2.9081, + "theoretical_loss": 3.771236551801546, + "tokens_seen": 715200512 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039559679037111335, + "loss": 3.0275, + "theoretical_loss": 3.771201599997956, + "tokens_seen": 715266048 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003955867602808426, + "loss": 2.9933, + "theoretical_loss": 3.7711666522932568, + "tokens_seen": 715331584 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003955767301905717, + "loss": 3.1027, + "theoretical_loss": 3.7711317086865925, + "tokens_seen": 715397120 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039556670010030095, + "loss": 2.9447, + "theoretical_loss": 3.7710967691771087, + "tokens_seen": 715462656 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003955566700100301, + "loss": 2.9054, + "theoretical_loss": 3.771061833763948, + "tokens_seen": 715528192 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003955466399197593, + "loss": 2.8627, + "theoretical_loss": 3.771026902446256, + "tokens_seen": 715593728 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003955366098294885, + "loss": 2.9385, + "theoretical_loss": 3.7709919752231773, + "tokens_seen": 715659264 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039552657973921767, + "loss": 2.9977, + "theoretical_loss": 3.7709570520938565, + "tokens_seen": 715724800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039551654964894685, + "loss": 2.943, + "theoretical_loss": 3.77092213305744, + "tokens_seen": 715790336 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039550651955867603, + "loss": 2.9495, + "theoretical_loss": 3.7708872181130726, + "tokens_seen": 715855872 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003954964894684052, + "loss": 2.8835, + "theoretical_loss": 3.7708523072599007, + "tokens_seen": 715921408 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1159922, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1537914276123047, + "objective/train/theoretical_loss": 3.770834853367246, + "objective/train/tokens_used": 736414176, + "theoretical_loss": 3.770834853367246, + "tokens_seen": 715954176 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039548645937813445, + "loss": 3.0888, + "theoretical_loss": 3.7708174004970703, + "tokens_seen": 715986944 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003954764292878636, + "loss": 2.7925, + "theoretical_loss": 3.7707824978237277, + "tokens_seen": 716052480 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003954663991975928, + "loss": 3.04, + "theoretical_loss": 3.7707475992390203, + "tokens_seen": 716118016 + }, + { + "epoch": 2.01, + "learning_rate": 0.000395456369107322, + "loss": 2.8616, + "theoretical_loss": 3.7707127047420936, + "tokens_seen": 716183552 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003954463390170512, + "loss": 2.8648, + "theoretical_loss": 3.770677814332097, + "tokens_seen": 716249088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039543630892678036, + "loss": 3.0475, + "theoretical_loss": 3.7706429280081752, + "tokens_seen": 716314624 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039542627883650954, + "loss": 3.0609, + "theoretical_loss": 3.770608045769478, + "tokens_seen": 716380160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003954162487462387, + "loss": 3.0457, + "theoretical_loss": 3.7705731676151526, + "tokens_seen": 716445696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039540621865596795, + "loss": 2.8729, + "theoretical_loss": 3.7705382935443468, + "tokens_seen": 716511232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003953961885656971, + "loss": 2.9342, + "theoretical_loss": 3.77050342355621, + "tokens_seen": 716576768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003953861584754263, + "loss": 3.0427, + "theoretical_loss": 3.77046855764989, + "tokens_seen": 716642304 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039537612838515544, + "loss": 2.9942, + "theoretical_loss": 3.7704336958245364, + "tokens_seen": 716707840 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003953660982948847, + "loss": 2.9365, + "theoretical_loss": 3.7703988380792985, + "tokens_seen": 716773376 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039535606820461386, + "loss": 3.0326, + "theoretical_loss": 3.7703639844133257, + "tokens_seen": 716838912 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039534603811434304, + "loss": 3.0179, + "theoretical_loss": 3.7703291348257673, + "tokens_seen": 716904448 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003953360080240722, + "loss": 3.0386, + "theoretical_loss": 3.770294289315774, + "tokens_seen": 716969984 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003953259779338014, + "loss": 2.8388, + "theoretical_loss": 3.770259447882495, + "tokens_seen": 717035520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003953159478435306, + "loss": 3.0852, + "theoretical_loss": 3.770224610525082, + "tokens_seen": 717101056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003953059177532598, + "loss": 2.9868, + "theoretical_loss": 3.7701897772426847, + "tokens_seen": 717166592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039529588766298895, + "loss": 2.8844, + "theoretical_loss": 3.770154948034455, + "tokens_seen": 717232128 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003952858575727182, + "loss": 3.0807, + "theoretical_loss": 3.7701201228995433, + "tokens_seen": 717297664 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039527582748244736, + "loss": 2.8964, + "theoretical_loss": 3.7700853018371023, + "tokens_seen": 717363200 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039526579739217654, + "loss": 2.9108, + "theoretical_loss": 3.770050484846283, + "tokens_seen": 717428736 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003952557673019057, + "loss": 2.8788, + "theoretical_loss": 3.770015671926237, + "tokens_seen": 717494272 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003952457372116349, + "loss": 3.1625, + "theoretical_loss": 3.7699808630761185, + "tokens_seen": 717559808 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1162641, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.032407283782959, + "objective/train/theoretical_loss": 3.769963460177016, + "objective/train/tokens_used": 738052576, + "theoretical_loss": 3.769963460177016, + "tokens_seen": 717592576 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003952357071213641, + "loss": 3.1103, + "theoretical_loss": 3.7699460582950772, + "tokens_seen": 717625344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003952256770310933, + "loss": 3.0923, + "theoretical_loss": 3.769911257582268, + "tokens_seen": 717690880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039521564694082245, + "loss": 3.0958, + "theoretical_loss": 3.7698764609368434, + "tokens_seen": 717756416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003952056168505517, + "loss": 2.7655, + "theoretical_loss": 3.7698416683579565, + "tokens_seen": 717821952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003951955867602808, + "loss": 3.0104, + "theoretical_loss": 3.769806879844761, + "tokens_seen": 717887488 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039518555667001005, + "loss": 3.1949, + "theoretical_loss": 3.7697720953964104, + "tokens_seen": 717953024 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039517552657973923, + "loss": 3.1883, + "theoretical_loss": 3.7697373150120597, + "tokens_seen": 718018560 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003951654964894684, + "loss": 2.994, + "theoretical_loss": 3.7697025386908622, + "tokens_seen": 718084096 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003951554663991976, + "loss": 2.9508, + "theoretical_loss": 3.769667766431973, + "tokens_seen": 718149632 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039514543630892677, + "loss": 3.021, + "theoretical_loss": 3.7696329982345467, + "tokens_seen": 718215168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039513540621865595, + "loss": 2.8766, + "theoretical_loss": 3.7695982340977388, + "tokens_seen": 718280704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003951253761283852, + "loss": 3.0467, + "theoretical_loss": 3.7695634740207034, + "tokens_seen": 718346240 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003951153460381143, + "loss": 3.1006, + "theoretical_loss": 3.7695287180025976, + "tokens_seen": 718411776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039510531594784355, + "loss": 3.0642, + "theoretical_loss": 3.7694939660425764, + "tokens_seen": 718477312 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039509528585757273, + "loss": 2.8547, + "theoretical_loss": 3.7694592181397963, + "tokens_seen": 718542848 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003950852557673019, + "loss": 3.0401, + "theoretical_loss": 3.7694244742934133, + "tokens_seen": 718608384 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003950752256770311, + "loss": 2.937, + "theoretical_loss": 3.7693897345025844, + "tokens_seen": 718673920 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003950651955867603, + "loss": 2.799, + "theoretical_loss": 3.769354998766466, + "tokens_seen": 718739456 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039505516549648946, + "loss": 2.9098, + "theoretical_loss": 3.7693202670842147, + "tokens_seen": 718804992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003950451354062187, + "loss": 3.0572, + "theoretical_loss": 3.769285539454989, + "tokens_seen": 718870528 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003950351053159478, + "loss": 2.9004, + "theoretical_loss": 3.7692508158779465, + "tokens_seen": 718936064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039502507522567705, + "loss": 2.8599, + "theoretical_loss": 3.769216096352244, + "tokens_seen": 719001600 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003950150451354062, + "loss": 3.0419, + "theoretical_loss": 3.7691813808770407, + "tokens_seen": 719067136 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003950050150451354, + "loss": 2.9706, + "theoretical_loss": 3.769146669451494, + "tokens_seen": 719132672 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003949949849548646, + "loss": 3.0014, + "theoretical_loss": 3.7691119620747626, + "tokens_seen": 719198208 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1165519, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1965038776397705, + "objective/train/theoretical_loss": 3.7690946099044402, + "objective/train/tokens_used": 739690976, + "theoretical_loss": 3.7690946099044402, + "tokens_seen": 719230976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003949849548645938, + "loss": 3.027, + "theoretical_loss": 3.769077258746006, + "tokens_seen": 719263744 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039497492477432296, + "loss": 2.86, + "theoretical_loss": 3.7690425594643835, + "tokens_seen": 719329280 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003949648946840522, + "loss": 3.0238, + "theoretical_loss": 3.769007864229054, + "tokens_seen": 719394816 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003949548645937813, + "loss": 3.0357, + "theoretical_loss": 3.768973173039176, + "tokens_seen": 719460352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039494483450351056, + "loss": 3.0076, + "theoretical_loss": 3.7689384858939112, + "tokens_seen": 719525888 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003949348044132397, + "loss": 2.998, + "theoretical_loss": 3.768903802792419, + "tokens_seen": 719591424 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003949247743229689, + "loss": 3.0795, + "theoretical_loss": 3.768869123733859, + "tokens_seen": 719656960 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003949147442326981, + "loss": 3.0192, + "theoretical_loss": 3.768834448717393, + "tokens_seen": 719722496 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003949047141424273, + "loss": 3.0543, + "theoretical_loss": 3.7687997777421822, + "tokens_seen": 719788032 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003948946840521565, + "loss": 3.021, + "theoretical_loss": 3.7687651108073856, + "tokens_seen": 719853568 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039488465396188564, + "loss": 3.0451, + "theoretical_loss": 3.7687304479121666, + "tokens_seen": 719919104 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003948746238716149, + "loss": 2.8847, + "theoretical_loss": 3.768695789055686, + "tokens_seen": 719984640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039486459378134406, + "loss": 2.944, + "theoretical_loss": 3.768661134237106, + "tokens_seen": 720050176 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039485456369107324, + "loss": 2.9702, + "theoretical_loss": 3.7686264834555883, + "tokens_seen": 720115712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003948445336008024, + "loss": 2.9392, + "theoretical_loss": 3.7685918367102955, + "tokens_seen": 720181248 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003948345035105316, + "loss": 3.0581, + "theoretical_loss": 3.7685571940003904, + "tokens_seen": 720246784 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003948244734202608, + "loss": 3.0891, + "theoretical_loss": 3.7685225553250348, + "tokens_seen": 720312320 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039481444332999, + "loss": 2.8273, + "theoretical_loss": 3.7684879206833934, + "tokens_seen": 720377856 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039480441323971915, + "loss": 3.0532, + "theoretical_loss": 3.7684532900746284, + "tokens_seen": 720443392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003947943831494484, + "loss": 3.1065, + "theoretical_loss": 3.768418663497904, + "tokens_seen": 720508928 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039478435305917756, + "loss": 2.9055, + "theoretical_loss": 3.7683840409523848, + "tokens_seen": 720574464 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039477432296890674, + "loss": 3.0955, + "theoretical_loss": 3.768349422437233, + "tokens_seen": 720640000 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003947642928786359, + "loss": 3.0392, + "theoretical_loss": 3.768314807951614, + "tokens_seen": 720705536 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003947542627883651, + "loss": 2.9683, + "theoretical_loss": 3.7682801974946933, + "tokens_seen": 720771072 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003947442326980943, + "loss": 2.9168, + "theoretical_loss": 3.768245591065634, + "tokens_seen": 720836608 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1168349, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7981910705566406, + "objective/train/theoretical_loss": 3.768228289361292, + "objective/train/tokens_used": 741329376, + "theoretical_loss": 3.768228289361292, + "tokens_seen": 720869376 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003947342026078235, + "loss": 3.091, + "theoretical_loss": 3.7682109886636024, + "tokens_seen": 720902144 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039472417251755265, + "loss": 2.9628, + "theoretical_loss": 3.7681763902877634, + "tokens_seen": 720967680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003947141424272819, + "loss": 3.0521, + "theoretical_loss": 3.768141795937283, + "tokens_seen": 721033216 + }, + { + "epoch": 2.01, + "learning_rate": 0.000394704112337011, + "loss": 3.0029, + "theoretical_loss": 3.768107205611327, + "tokens_seen": 721098752 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039469408224674025, + "loss": 3.183, + "theoretical_loss": 3.768072619309061, + "tokens_seen": 721164288 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039468405215646943, + "loss": 3.0307, + "theoretical_loss": 3.768038037029652, + "tokens_seen": 721229824 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003946740220661986, + "loss": 3.1676, + "theoretical_loss": 3.7680034587722666, + "tokens_seen": 721295360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003946639919759278, + "loss": 2.9899, + "theoretical_loss": 3.767968884536071, + "tokens_seen": 721360896 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039465396188565697, + "loss": 2.9757, + "theoretical_loss": 3.7679343143202324, + "tokens_seen": 721426432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039464393179538615, + "loss": 2.9773, + "theoretical_loss": 3.7678997481239187, + "tokens_seen": 721491968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003946339017051154, + "loss": 2.9773, + "theoretical_loss": 3.767865185946297, + "tokens_seen": 721557504 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003946238716148445, + "loss": 3.1097, + "theoretical_loss": 3.7678306277865357, + "tokens_seen": 721623040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039461384152457375, + "loss": 2.9921, + "theoretical_loss": 3.7677960736438023, + "tokens_seen": 721688576 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039460381143430293, + "loss": 3.015, + "theoretical_loss": 3.7677615235172652, + "tokens_seen": 721754112 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003945937813440321, + "loss": 2.9632, + "theoretical_loss": 3.7677269774060935, + "tokens_seen": 721819648 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003945837512537613, + "loss": 3.0617, + "theoretical_loss": 3.767692435309456, + "tokens_seen": 721885184 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003945737211634905, + "loss": 3.074, + "theoretical_loss": 3.767657897226521, + "tokens_seen": 721950720 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039456369107321966, + "loss": 2.972, + "theoretical_loss": 3.767623363156458, + "tokens_seen": 722016256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003945536609829489, + "loss": 2.8683, + "theoretical_loss": 3.767588833098438, + "tokens_seen": 722081792 + }, + { + "epoch": 2.01, + "learning_rate": 0.000394543630892678, + "loss": 3.0255, + "theoretical_loss": 3.767554307051629, + "tokens_seen": 722147328 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039453360080240725, + "loss": 3.0124, + "theoretical_loss": 3.7675197850152022, + "tokens_seen": 722212864 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003945235707121364, + "loss": 3.07, + "theoretical_loss": 3.7674852669883276, + "tokens_seen": 722278400 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003945135406218656, + "loss": 2.9092, + "theoretical_loss": 3.7674507529701753, + "tokens_seen": 722343936 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003945035105315948, + "loss": 3.0751, + "theoretical_loss": 3.767416242959917, + "tokens_seen": 722409472 + }, + { + "epoch": 2.01, + "learning_rate": 0.000394493480441324, + "loss": 3.1033, + "theoretical_loss": 3.767381736956723, + "tokens_seen": 722475008 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1169764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.271573781967163, + "objective/train/theoretical_loss": 3.767364485457516, + "objective/train/tokens_used": 742967776, + "theoretical_loss": 3.767364485457516, + "tokens_seen": 722507776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039448345035105316, + "loss": 2.9502, + "theoretical_loss": 3.7673472349597645, + "tokens_seen": 722540544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003944734202607824, + "loss": 2.9166, + "theoretical_loss": 3.767312736968214, + "tokens_seen": 722606080 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003944633901705115, + "loss": 2.9167, + "theoretical_loss": 3.7672782429812433, + "tokens_seen": 722671616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039445336008024076, + "loss": 2.8885, + "theoretical_loss": 3.7672437529980227, + "tokens_seen": 722737152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003944433299899699, + "loss": 2.8351, + "theoretical_loss": 3.7672092670177264, + "tokens_seen": 722802688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003944332998996991, + "loss": 3.0676, + "theoretical_loss": 3.767174785039526, + "tokens_seen": 722868224 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003944232698094283, + "loss": 3.0468, + "theoretical_loss": 3.767140307062595, + "tokens_seen": 722933760 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003944132397191575, + "loss": 2.9273, + "theoretical_loss": 3.767105833086105, + "tokens_seen": 722999296 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039440320962888666, + "loss": 2.9165, + "theoretical_loss": 3.767071363109231, + "tokens_seen": 723064832 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039439317953861584, + "loss": 3.006, + "theoretical_loss": 3.7670368971311454, + "tokens_seen": 723130368 + }, + { + "epoch": 2.01, + "learning_rate": 0.000394383149448345, + "loss": 3.028, + "theoretical_loss": 3.7670024351510225, + "tokens_seen": 723195904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039437311935807426, + "loss": 2.9205, + "theoretical_loss": 3.7669679771680364, + "tokens_seen": 723261440 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003943630892678034, + "loss": 3.0216, + "theoretical_loss": 3.7669335231813603, + "tokens_seen": 723326976 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003943530591775326, + "loss": 2.8688, + "theoretical_loss": 3.7668990731901704, + "tokens_seen": 723392512 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039434302908726175, + "loss": 3.1076, + "theoretical_loss": 3.76686462719364, + "tokens_seen": 723458048 + }, + { + "epoch": 2.01, + "learning_rate": 0.000394332998996991, + "loss": 2.9989, + "theoretical_loss": 3.766830185190945, + "tokens_seen": 723523584 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039432296890672017, + "loss": 3.009, + "theoretical_loss": 3.76679574718126, + "tokens_seen": 723589120 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039431293881644935, + "loss": 2.9169, + "theoretical_loss": 3.766761313163761, + "tokens_seen": 723654656 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039430290872617853, + "loss": 2.9348, + "theoretical_loss": 3.7667268831376237, + "tokens_seen": 723720192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039429287863590776, + "loss": 2.9618, + "theoretical_loss": 3.7666924571020237, + "tokens_seen": 723785728 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003942828485456369, + "loss": 2.8006, + "theoretical_loss": 3.766658035056137, + "tokens_seen": 723851264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003942728184553661, + "loss": 2.9821, + "theoretical_loss": 3.7666236169991407, + "tokens_seen": 723916800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039426278836509525, + "loss": 2.9601, + "theoretical_loss": 3.766589202930211, + "tokens_seen": 723982336 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003942527582748245, + "loss": 2.9411, + "theoretical_loss": 3.7665547928485257, + "tokens_seen": 724047872 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039424272818455367, + "loss": 3.0685, + "theoretical_loss": 3.7665203867532613, + "tokens_seen": 724113408 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1173596, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.747535467147827, + "objective/train/theoretical_loss": 3.76650318520028, + "objective/train/tokens_used": 744606176, + "theoretical_loss": 3.76650318520028, + "tokens_seen": 724146176 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039423269809428285, + "loss": 2.8023, + "theoretical_loss": 3.7664859846435954, + "tokens_seen": 724178944 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039422266800401203, + "loss": 3.0073, + "theoretical_loss": 3.7664515865187056, + "tokens_seen": 724244480 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003942126379137412, + "loss": 2.9837, + "theoretical_loss": 3.7664171923777694, + "tokens_seen": 724310016 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003942026078234704, + "loss": 2.8445, + "theoretical_loss": 3.766382802219966, + "tokens_seen": 724375552 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039419257773319963, + "loss": 3.1356, + "theoretical_loss": 3.7663484160444733, + "tokens_seen": 724441088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039418254764292876, + "loss": 3.0142, + "theoretical_loss": 3.76631403385047, + "tokens_seen": 724506624 + }, + { + "epoch": 2.01, + "learning_rate": 0.000394172517552658, + "loss": 2.8943, + "theoretical_loss": 3.7662796556371347, + "tokens_seen": 724572160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003941624874623871, + "loss": 2.894, + "theoretical_loss": 3.766245281403647, + "tokens_seen": 724637696 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039415245737211635, + "loss": 2.9139, + "theoretical_loss": 3.7662109111491864, + "tokens_seen": 724703232 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003941424272818456, + "loss": 2.8219, + "theoretical_loss": 3.7661765448729314, + "tokens_seen": 724768768 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003941323971915747, + "loss": 2.9628, + "theoretical_loss": 3.7661421825740633, + "tokens_seen": 724834304 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039412236710130395, + "loss": 3.0325, + "theoretical_loss": 3.7661078242517614, + "tokens_seen": 724899840 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039411233701103313, + "loss": 2.9282, + "theoretical_loss": 3.7660734699052067, + "tokens_seen": 724965376 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003941023069207623, + "loss": 3.1457, + "theoretical_loss": 3.7660391195335787, + "tokens_seen": 725030912 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003940922768304915, + "loss": 2.9857, + "theoretical_loss": 3.7660047731360597, + "tokens_seen": 725096448 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003940822467402207, + "loss": 2.9375, + "theoretical_loss": 3.7659704307118296, + "tokens_seen": 725161984 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039407221664994986, + "loss": 2.8628, + "theoretical_loss": 3.7659360922600706, + "tokens_seen": 725227520 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003940621865596791, + "loss": 2.9755, + "theoretical_loss": 3.7659017577799636, + "tokens_seen": 725293056 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003940521564694082, + "loss": 3.0269, + "theoretical_loss": 3.7658674272706905, + "tokens_seen": 725358592 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039404212637913745, + "loss": 2.9141, + "theoretical_loss": 3.7658331007314336, + "tokens_seen": 725424128 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003940320962888666, + "loss": 3.0412, + "theoretical_loss": 3.7657987781613755, + "tokens_seen": 725489664 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003940220661985958, + "loss": 2.8424, + "theoretical_loss": 3.765764459559698, + "tokens_seen": 725555200 + }, + { + "epoch": 2.01, + "learning_rate": 0.000394012036108325, + "loss": 2.9082, + "theoretical_loss": 3.765730144925585, + "tokens_seen": 725620736 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003940020060180542, + "loss": 3.0031, + "theoretical_loss": 3.765695834258218, + "tokens_seen": 725686272 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039399197592778336, + "loss": 3.0079, + "theoretical_loss": 3.765661527556782, + "tokens_seen": 725751808 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1175012, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0700247287750244, + "objective/train/theoretical_loss": 3.7656443756930322, + "objective/train/tokens_used": 746244576, + "theoretical_loss": 3.7656443756930322, + "tokens_seen": 725784576 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003939819458375126, + "loss": 2.9299, + "theoretical_loss": 3.765627224820459, + "tokens_seen": 725817344 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003939719157472417, + "loss": 2.919, + "theoretical_loss": 3.7655929260484338, + "tokens_seen": 725882880 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039396188565697096, + "loss": 3.0735, + "theoretical_loss": 3.765558631239889, + "tokens_seen": 725948416 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003939518555667001, + "loss": 3.0203, + "theoretical_loss": 3.7655243403940113, + "tokens_seen": 726013952 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003939418254764293, + "loss": 2.8896, + "theoretical_loss": 3.765490053509983, + "tokens_seen": 726079488 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003939317953861585, + "loss": 2.8783, + "theoretical_loss": 3.7654557705869895, + "tokens_seen": 726145024 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003939217652958877, + "loss": 2.9077, + "theoretical_loss": 3.765421491624216, + "tokens_seen": 726210560 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039391173520561686, + "loss": 2.9235, + "theoretical_loss": 3.765387216620848, + "tokens_seen": 726276096 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039390170511534604, + "loss": 2.9358, + "theoretical_loss": 3.76535294557607, + "tokens_seen": 726341632 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003938916750250752, + "loss": 2.9852, + "theoretical_loss": 3.7653186784890678, + "tokens_seen": 726407168 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039388164493480446, + "loss": 2.6646, + "theoretical_loss": 3.765284415359028, + "tokens_seen": 726472704 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003938716148445336, + "loss": 2.7829, + "theoretical_loss": 3.765250156185137, + "tokens_seen": 726538240 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003938615847542628, + "loss": 3.0274, + "theoretical_loss": 3.76521590096658, + "tokens_seen": 726603776 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039385155466399195, + "loss": 3.0716, + "theoretical_loss": 3.7651816497025443, + "tokens_seen": 726669312 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003938415245737212, + "loss": 3.1522, + "theoretical_loss": 3.765147402392217, + "tokens_seen": 726734848 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039383149448345037, + "loss": 3.0922, + "theoretical_loss": 3.7651131590347857, + "tokens_seen": 726800384 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039382146439317955, + "loss": 2.8843, + "theoretical_loss": 3.7650789196294365, + "tokens_seen": 726865920 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039381143430290873, + "loss": 2.8096, + "theoretical_loss": 3.7650446841753578, + "tokens_seen": 726931456 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039380140421263796, + "loss": 3.0124, + "theoretical_loss": 3.765010452671737, + "tokens_seen": 726996992 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003937913741223671, + "loss": 2.9722, + "theoretical_loss": 3.7649762251177634, + "tokens_seen": 727062528 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003937813440320963, + "loss": 3.1579, + "theoretical_loss": 3.7649420015126234, + "tokens_seen": 727128064 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039377131394182545, + "loss": 2.9445, + "theoretical_loss": 3.7649077818555075, + "tokens_seen": 727193600 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003937612838515547, + "loss": 3.0278, + "theoretical_loss": 3.764873566145603, + "tokens_seen": 727259136 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039375125376128387, + "loss": 2.9434, + "theoretical_loss": 3.7648393543821, + "tokens_seen": 727324672 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039374122367101305, + "loss": 2.9412, + "theoretical_loss": 3.7648051465641874, + "tokens_seen": 727390208 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1178001, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.257294178009033, + "objective/train/theoretical_loss": 3.7647880441345745, + "objective/train/tokens_used": 747882976, + "theoretical_loss": 3.7647880441345745, + "tokens_seen": 727422976 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039373119358074223, + "loss": 3.0952, + "theoretical_loss": 3.764770942691055, + "tokens_seen": 727455744 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003937211634904714, + "loss": 3.029, + "theoretical_loss": 3.7647367427618916, + "tokens_seen": 727521280 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003937111334002006, + "loss": 2.9178, + "theoretical_loss": 3.7647025467758883, + "tokens_seen": 727586816 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039370110330992983, + "loss": 3.0031, + "theoretical_loss": 3.764668354732235, + "tokens_seen": 727652352 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039369107321965896, + "loss": 3.1494, + "theoretical_loss": 3.764634166630122, + "tokens_seen": 727717888 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003936810431293882, + "loss": 3.0656, + "theoretical_loss": 3.7645999824687406, + "tokens_seen": 727783424 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003936710130391173, + "loss": 3.0214, + "theoretical_loss": 3.764565802247281, + "tokens_seen": 727848960 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039366098294884655, + "loss": 2.927, + "theoretical_loss": 3.764531625964935, + "tokens_seen": 727914496 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039365095285857573, + "loss": 3.074, + "theoretical_loss": 3.7644974536208933, + "tokens_seen": 727980032 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003936409227683049, + "loss": 2.7833, + "theoretical_loss": 3.764463285214349, + "tokens_seen": 728045568 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003936308926780341, + "loss": 2.9626, + "theoretical_loss": 3.7644291207444924, + "tokens_seen": 728111104 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039362086258776333, + "loss": 2.8981, + "theoretical_loss": 3.7643949602105167, + "tokens_seen": 728176640 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039361083249749246, + "loss": 3.0587, + "theoretical_loss": 3.764360803611614, + "tokens_seen": 728242176 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003936008024072217, + "loss": 3.018, + "theoretical_loss": 3.764326650946977, + "tokens_seen": 728307712 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003935907723169508, + "loss": 2.9537, + "theoretical_loss": 3.7642925022157985, + "tokens_seen": 728373248 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039358074222668006, + "loss": 2.9691, + "theoretical_loss": 3.7642583574172717, + "tokens_seen": 728438784 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039357071213640924, + "loss": 2.8712, + "theoretical_loss": 3.7642242165505895, + "tokens_seen": 728504320 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003935606820461384, + "loss": 2.9357, + "theoretical_loss": 3.764190079614946, + "tokens_seen": 728569856 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003935506519558676, + "loss": 3.0412, + "theoretical_loss": 3.7641559466095353, + "tokens_seen": 728635392 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003935406218655968, + "loss": 3.0092, + "theoretical_loss": 3.764121817533551, + "tokens_seen": 728700928 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039353059177532596, + "loss": 2.9401, + "theoretical_loss": 3.7640876923861875, + "tokens_seen": 728766464 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003935205616850552, + "loss": 2.9671, + "theoretical_loss": 3.764053571166639, + "tokens_seen": 728832000 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003935105315947843, + "loss": 3.0154, + "theoretical_loss": 3.764019453874101, + "tokens_seen": 728897536 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039350050150451356, + "loss": 3.0128, + "theoretical_loss": 3.7639853405077677, + "tokens_seen": 728963072 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003934904714142427, + "loss": 3.1041, + "theoretical_loss": 3.7639512310668355, + "tokens_seen": 729028608 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1181058, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9340803623199463, + "objective/train/theoretical_loss": 3.7639341778181428, + "objective/train/tokens_used": 749521376, + "theoretical_loss": 3.7639341778181428, + "tokens_seen": 729061376 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003934804413239719, + "loss": 2.9747, + "theoretical_loss": 3.763917125550499, + "tokens_seen": 729094144 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003934704112337011, + "loss": 2.9563, + "theoretical_loss": 3.7638830239579537, + "tokens_seen": 729159680 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003934603811434303, + "loss": 2.9894, + "theoretical_loss": 3.7638489262883965, + "tokens_seen": 729225216 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039345035105315946, + "loss": 2.9895, + "theoretical_loss": 3.763814832541023, + "tokens_seen": 729290752 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003934403209628887, + "loss": 3.0263, + "theoretical_loss": 3.7637807427150296, + "tokens_seen": 729356288 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039343029087261783, + "loss": 2.9748, + "theoretical_loss": 3.7637466568096127, + "tokens_seen": 729421824 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039342026078234706, + "loss": 3.1357, + "theoretical_loss": 3.7637125748239697, + "tokens_seen": 729487360 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003934102306920762, + "loss": 3.0058, + "theoretical_loss": 3.763678496757298, + "tokens_seen": 729552896 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003934002006018054, + "loss": 3.1418, + "theoretical_loss": 3.7636444226087953, + "tokens_seen": 729618432 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039339017051153466, + "loss": 2.9299, + "theoretical_loss": 3.763610352377658, + "tokens_seen": 729683968 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003933801404212638, + "loss": 3.0411, + "theoretical_loss": 3.763576286063085, + "tokens_seen": 729749504 + }, + { + "epoch": 2.01, + "learning_rate": 0.000393370110330993, + "loss": 2.8555, + "theoretical_loss": 3.763542223664273, + "tokens_seen": 729815040 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039336008024072215, + "loss": 2.9758, + "theoretical_loss": 3.763508165180422, + "tokens_seen": 729880576 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003933500501504514, + "loss": 2.9351, + "theoretical_loss": 3.76347411061073, + "tokens_seen": 729946112 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039334002006018057, + "loss": 2.9464, + "theoretical_loss": 3.7634400599543953, + "tokens_seen": 730011648 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039332998996990975, + "loss": 2.9003, + "theoretical_loss": 3.7634060132106173, + "tokens_seen": 730077184 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039331995987963893, + "loss": 3.0644, + "theoretical_loss": 3.7633719703785955, + "tokens_seen": 730142720 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039330992978936816, + "loss": 3.0175, + "theoretical_loss": 3.7633379314575293, + "tokens_seen": 730208256 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003932998996990973, + "loss": 2.9642, + "theoretical_loss": 3.763303896446618, + "tokens_seen": 730273792 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003932898696088265, + "loss": 3.0552, + "theoretical_loss": 3.7632698653450625, + "tokens_seen": 730339328 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039327983951855565, + "loss": 2.9439, + "theoretical_loss": 3.763235838152062, + "tokens_seen": 730404864 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003932698094282849, + "loss": 2.9574, + "theoretical_loss": 3.763201814866817, + "tokens_seen": 730470400 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039325977933801407, + "loss": 2.8714, + "theoretical_loss": 3.7631677954885294, + "tokens_seen": 730535936 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039324974924774325, + "loss": 2.9765, + "theoretical_loss": 3.7631337800163993, + "tokens_seen": 730601472 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039323971915747243, + "loss": 3.0144, + "theoretical_loss": 3.763099768449628, + "tokens_seen": 730667008 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1183729, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0536599159240723, + "objective/train/theoretical_loss": 3.763082764130502, + "objective/train/tokens_used": 751159776, + "theoretical_loss": 3.763082764130502, + "tokens_seen": 730699776 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003932296890672016, + "loss": 2.7989, + "theoretical_loss": 3.7630657607874163, + "tokens_seen": 730732544 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003932196589769308, + "loss": 2.8006, + "theoretical_loss": 3.7630317570289664, + "tokens_seen": 730798080 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039320962888666003, + "loss": 3.0067, + "theoretical_loss": 3.7629977571734807, + "tokens_seen": 730863616 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039319959879638916, + "loss": 3.056, + "theoretical_loss": 3.76296376122016, + "tokens_seen": 730929152 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003931895687061184, + "loss": 3.0004, + "theoretical_loss": 3.762929769168208, + "tokens_seen": 730994688 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003931795386158475, + "loss": 2.9568, + "theoretical_loss": 3.7628957810168258, + "tokens_seen": 731060224 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039316950852557675, + "loss": 2.8515, + "theoretical_loss": 3.7628617967652174, + "tokens_seen": 731125760 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039315947843530593, + "loss": 2.9819, + "theoretical_loss": 3.7628278164125852, + "tokens_seen": 731191296 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003931494483450351, + "loss": 3.089, + "theoretical_loss": 3.762793839958133, + "tokens_seen": 731256832 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003931394182547643, + "loss": 3.0008, + "theoretical_loss": 3.7627598674010643, + "tokens_seen": 731322368 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039312938816449353, + "loss": 2.7548, + "theoretical_loss": 3.762725898740582, + "tokens_seen": 731387904 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039311935807422266, + "loss": 3.0958, + "theoretical_loss": 3.762691933975891, + "tokens_seen": 731453440 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003931093279839519, + "loss": 3.1247, + "theoretical_loss": 3.7626579731061955, + "tokens_seen": 731518976 + }, + { + "epoch": 2.01, + "learning_rate": 0.000393099297893681, + "loss": 3.0943, + "theoretical_loss": 3.7626240161306987, + "tokens_seen": 731584512 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039308926780341026, + "loss": 3.0574, + "theoretical_loss": 3.7625900630486067, + "tokens_seen": 731650048 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039307923771313944, + "loss": 2.8764, + "theoretical_loss": 3.762556113859124, + "tokens_seen": 731715584 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003930692076228686, + "loss": 3.0402, + "theoretical_loss": 3.762522168561455, + "tokens_seen": 731781120 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003930591775325978, + "loss": 2.8586, + "theoretical_loss": 3.7624882271548064, + "tokens_seen": 731846656 + }, + { + "epoch": 2.01, + "learning_rate": 0.000393049147442327, + "loss": 3.0565, + "theoretical_loss": 3.7624542896383826, + "tokens_seen": 731912192 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039303911735205616, + "loss": 3.0302, + "theoretical_loss": 3.7624203560113902, + "tokens_seen": 731977728 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003930290872617854, + "loss": 2.9733, + "theoretical_loss": 3.7623864262730353, + "tokens_seen": 732043264 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003930190571715145, + "loss": 2.9608, + "theoretical_loss": 3.762352500422524, + "tokens_seen": 732108800 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039300902708124376, + "loss": 3.1759, + "theoretical_loss": 3.7623185784590625, + "tokens_seen": 732174336 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003929989969909729, + "loss": 2.8913, + "theoretical_loss": 3.7622846603818574, + "tokens_seen": 732239872 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003929889669007021, + "loss": 3.1156, + "theoretical_loss": 3.7622507461901167, + "tokens_seen": 732305408 + }, + { + "epoch": 2.01, + "objective/train/docs_used": 1186092, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1636579036712646, + "objective/train/theoretical_loss": 3.762233790551048, + "objective/train/tokens_used": 752798176, + "theoretical_loss": 3.762233790551048, + "tokens_seen": 732338176 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003929789368104313, + "loss": 2.8965, + "theoretical_loss": 3.7622168358830472, + "tokens_seen": 732370944 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003929689067201605, + "loss": 2.9715, + "theoretical_loss": 3.762182929459856, + "tokens_seen": 732436480 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039295887662988967, + "loss": 3.1229, + "theoretical_loss": 3.7621490269197517, + "tokens_seen": 732502016 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003929488465396189, + "loss": 3.062, + "theoretical_loss": 3.7621151282619407, + "tokens_seen": 732567552 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039293881644934803, + "loss": 3.0581, + "theoretical_loss": 3.762081233485633, + "tokens_seen": 732633088 + }, + { + "epoch": 2.01, + "learning_rate": 0.00039292878635907726, + "loss": 2.9432, + "theoretical_loss": 3.7620473425900354, + "tokens_seen": 732698624 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003929187562688064, + "loss": 2.9752, + "theoretical_loss": 3.762013455574358, + "tokens_seen": 732764160 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003929087261785356, + "loss": 3.0003, + "theoretical_loss": 3.7619795724378085, + "tokens_seen": 732829696 + }, + { + "epoch": 2.01, + "learning_rate": 0.0003928986960882648, + "loss": 3.0065, + "theoretical_loss": 3.7619456931795963, + "tokens_seen": 732895232 + }, + { + "epoch": 2.02, + "learning_rate": 0.000392888665997994, + "loss": 3.0111, + "theoretical_loss": 3.7619118177989312, + "tokens_seen": 732960768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039287863590772317, + "loss": 2.8993, + "theoretical_loss": 3.761877946295022, + "tokens_seen": 733026304 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039286860581745235, + "loss": 2.8777, + "theoretical_loss": 3.761844078667079, + "tokens_seen": 733091840 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039285857572718153, + "loss": 2.9872, + "theoretical_loss": 3.7618102149143127, + "tokens_seen": 733157376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039284854563691077, + "loss": 2.9684, + "theoretical_loss": 3.7617763550359324, + "tokens_seen": 733222912 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003928385155466399, + "loss": 2.9447, + "theoretical_loss": 3.7617424990311488, + "tokens_seen": 733288448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039282848545636913, + "loss": 3.0999, + "theoretical_loss": 3.761708646899173, + "tokens_seen": 733353984 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039281845536609825, + "loss": 2.9589, + "theoretical_loss": 3.7616747986392154, + "tokens_seen": 733419520 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003928084252758275, + "loss": 2.9288, + "theoretical_loss": 3.7616409542504883, + "tokens_seen": 733485056 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039279839518555667, + "loss": 3.0479, + "theoretical_loss": 3.761607113732202, + "tokens_seen": 733550592 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039278836509528585, + "loss": 2.8225, + "theoretical_loss": 3.7615732770835684, + "tokens_seen": 733616128 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039277833500501503, + "loss": 2.9206, + "theoretical_loss": 3.7615394443038, + "tokens_seen": 733681664 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039276830491474427, + "loss": 2.816, + "theoretical_loss": 3.761505615392108, + "tokens_seen": 733747200 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003927582748244734, + "loss": 2.9274, + "theoretical_loss": 3.7614717903477053, + "tokens_seen": 733812736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039274824473420263, + "loss": 2.8954, + "theoretical_loss": 3.761437969169804, + "tokens_seen": 733878272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039273821464393176, + "loss": 3.0538, + "theoretical_loss": 3.761404151857618, + "tokens_seen": 733943808 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1188863, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.175291061401367, + "objective/train/theoretical_loss": 3.7613872446509213, + "objective/train/tokens_used": 754436576, + "theoretical_loss": 3.7613872446509213, + "tokens_seen": 733976576 + }, + { + "epoch": 2.02, + "learning_rate": 0.000392728184553661, + "loss": 3.0636, + "theoretical_loss": 3.7613703384103587, + "tokens_seen": 734009344 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003927181544633902, + "loss": 3.0616, + "theoretical_loss": 3.7613365288272407, + "tokens_seen": 734074880 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039270812437311936, + "loss": 2.9298, + "theoretical_loss": 3.7613027231074767, + "tokens_seen": 734140416 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039269809428284854, + "loss": 3.1064, + "theoretical_loss": 3.7612689212502812, + "tokens_seen": 734205952 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003926880641925777, + "loss": 3.0167, + "theoretical_loss": 3.7612351232548673, + "tokens_seen": 734271488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003926780341023069, + "loss": 2.9981, + "theoretical_loss": 3.76120132912045, + "tokens_seen": 734337024 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039266800401203613, + "loss": 2.9742, + "theoretical_loss": 3.7611675388462427, + "tokens_seen": 734402560 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039265797392176526, + "loss": 3.0599, + "theoretical_loss": 3.761133752431461, + "tokens_seen": 734468096 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003926479438314945, + "loss": 3.0561, + "theoretical_loss": 3.7610999698753194, + "tokens_seen": 734533632 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039263791374122373, + "loss": 2.8701, + "theoretical_loss": 3.761066191177033, + "tokens_seen": 734599168 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039262788365095286, + "loss": 2.9229, + "theoretical_loss": 3.761032416335817, + "tokens_seen": 734664704 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003926178535606821, + "loss": 2.9094, + "theoretical_loss": 3.7609986453508872, + "tokens_seen": 734730240 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003926078234704112, + "loss": 3.0877, + "theoretical_loss": 3.7609648782214595, + "tokens_seen": 734795776 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039259779338014046, + "loss": 2.8903, + "theoretical_loss": 3.7609311149467493, + "tokens_seen": 734861312 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039258776328986964, + "loss": 2.907, + "theoretical_loss": 3.7608973555259735, + "tokens_seen": 734926848 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003925777331995988, + "loss": 2.966, + "theoretical_loss": 3.7608635999583484, + "tokens_seen": 734992384 + }, + { + "epoch": 2.02, + "learning_rate": 0.000392567703109328, + "loss": 3.0126, + "theoretical_loss": 3.760829848243091, + "tokens_seen": 735057920 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003925576730190572, + "loss": 3.0149, + "theoretical_loss": 3.7607961003794173, + "tokens_seen": 735123456 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039254764292878636, + "loss": 3.0955, + "theoretical_loss": 3.760762356366545, + "tokens_seen": 735188992 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003925376128385156, + "loss": 2.9997, + "theoretical_loss": 3.760728616203692, + "tokens_seen": 735254528 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003925275827482447, + "loss": 2.9875, + "theoretical_loss": 3.7606948798900754, + "tokens_seen": 735320064 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039251755265797396, + "loss": 3.0517, + "theoretical_loss": 3.760661147424913, + "tokens_seen": 735385600 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003925075225677031, + "loss": 2.8286, + "theoretical_loss": 3.760627418807423, + "tokens_seen": 735451136 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003924974924774323, + "loss": 2.9761, + "theoretical_loss": 3.760593694036824, + "tokens_seen": 735516672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003924874623871615, + "loss": 3.0138, + "theoretical_loss": 3.760559973112334, + "tokens_seen": 735582208 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1191563, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.875781297683716, + "objective/train/theoretical_loss": 3.760543114092136, + "objective/train/tokens_used": 756074976, + "theoretical_loss": 3.760543114092136, + "tokens_seen": 735614976 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003924774322968907, + "loss": 3.0784, + "theoretical_loss": 3.760526256033172, + "tokens_seen": 735647744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039246740220661987, + "loss": 2.8496, + "theoretical_loss": 3.7604925427985574, + "tokens_seen": 735713280 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003924573721163491, + "loss": 2.9972, + "theoretical_loss": 3.760458833407709, + "tokens_seen": 735778816 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039244734202607823, + "loss": 2.8898, + "theoretical_loss": 3.760425127859846, + "tokens_seen": 735844352 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039243731193580746, + "loss": 3.0277, + "theoretical_loss": 3.7603914261541886, + "tokens_seen": 735909888 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003924272818455366, + "loss": 3.0297, + "theoretical_loss": 3.7603577282899563, + "tokens_seen": 735975424 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003924172517552658, + "loss": 3.1111, + "theoretical_loss": 3.7603240342663695, + "tokens_seen": 736040960 + }, + { + "epoch": 2.02, + "learning_rate": 0.000392407221664995, + "loss": 3.0388, + "theoretical_loss": 3.760290344082648, + "tokens_seen": 736106496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003923971915747242, + "loss": 3.0297, + "theoretical_loss": 3.760256657738014, + "tokens_seen": 736172032 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039238716148445337, + "loss": 2.8807, + "theoretical_loss": 3.7602229752316862, + "tokens_seen": 736237568 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039237713139418255, + "loss": 3.0275, + "theoretical_loss": 3.760189296562887, + "tokens_seen": 736303104 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039236710130391173, + "loss": 2.931, + "theoretical_loss": 3.7601556217308376, + "tokens_seen": 736368640 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039235707121364097, + "loss": 2.9998, + "theoretical_loss": 3.760121950734759, + "tokens_seen": 736434176 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003923470411233701, + "loss": 2.8996, + "theoretical_loss": 3.7600882835738734, + "tokens_seen": 736499712 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039233701103309933, + "loss": 3.1816, + "theoretical_loss": 3.760054620247402, + "tokens_seen": 736565248 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039232698094282846, + "loss": 2.9031, + "theoretical_loss": 3.760020960754568, + "tokens_seen": 736630784 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003923169508525577, + "loss": 2.9369, + "theoretical_loss": 3.759987305094593, + "tokens_seen": 736696320 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039230692076228687, + "loss": 2.8565, + "theoretical_loss": 3.7599536532666997, + "tokens_seen": 736761856 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039229689067201605, + "loss": 3.0957, + "theoretical_loss": 3.7599200052701116, + "tokens_seen": 736827392 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039228686058174523, + "loss": 3.007, + "theoretical_loss": 3.7598863611040514, + "tokens_seen": 736892928 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039227683049147447, + "loss": 2.9723, + "theoretical_loss": 3.759852720767742, + "tokens_seen": 736958464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003922668004012036, + "loss": 2.9371, + "theoretical_loss": 3.759819084260408, + "tokens_seen": 737024000 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039225677031093283, + "loss": 2.9506, + "theoretical_loss": 3.759785451581272, + "tokens_seen": 737089536 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039224674022066196, + "loss": 2.9648, + "theoretical_loss": 3.7597518227295588, + "tokens_seen": 737155072 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003922367101303912, + "loss": 2.7785, + "theoretical_loss": 3.759718197704492, + "tokens_seen": 737220608 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1194303, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7603251934051514, + "objective/train/theoretical_loss": 3.7597013866267086, + "objective/train/tokens_used": 757713376, + "theoretical_loss": 3.7597013866267086, + "tokens_seen": 737253376 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003922266800401204, + "loss": 2.8139, + "theoretical_loss": 3.7596845765052964, + "tokens_seen": 737286144 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039221664994984956, + "loss": 3.0643, + "theoretical_loss": 3.759650959131197, + "tokens_seen": 737351680 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039220661985957874, + "loss": 3.0923, + "theoretical_loss": 3.759617345581418, + "tokens_seen": 737417216 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003921965897693079, + "loss": 3.1508, + "theoretical_loss": 3.759583735855185, + "tokens_seen": 737482752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003921865596790371, + "loss": 2.8939, + "theoretical_loss": 3.759550129951723, + "tokens_seen": 737548288 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039217652958876633, + "loss": 3.0439, + "theoretical_loss": 3.7595165278702583, + "tokens_seen": 737613824 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039216649949849546, + "loss": 3.0454, + "theoretical_loss": 3.759482929610016, + "tokens_seen": 737679360 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003921564694082247, + "loss": 2.9256, + "theoretical_loss": 3.7594493351702223, + "tokens_seen": 737744896 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003921464393179539, + "loss": 2.8981, + "theoretical_loss": 3.759415744550104, + "tokens_seen": 737810432 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039213640922768306, + "loss": 3.0897, + "theoretical_loss": 3.7593821577488864, + "tokens_seen": 737875968 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039212637913741224, + "loss": 3.0151, + "theoretical_loss": 3.759348574765797, + "tokens_seen": 737941504 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003921163490471414, + "loss": 3.0164, + "theoretical_loss": 3.7593149956000627, + "tokens_seen": 738007040 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003921063189568706, + "loss": 3.1676, + "theoretical_loss": 3.7592814202509106, + "tokens_seen": 738072576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039209628886659984, + "loss": 3.0582, + "theoretical_loss": 3.759247848717568, + "tokens_seen": 738138112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039208625877632896, + "loss": 2.8543, + "theoretical_loss": 3.7592142809992626, + "tokens_seen": 738203648 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003920762286860582, + "loss": 2.9453, + "theoretical_loss": 3.759180717095222, + "tokens_seen": 738269184 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003920661985957873, + "loss": 3.0692, + "theoretical_loss": 3.7591471570046746, + "tokens_seen": 738334720 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039205616850551656, + "loss": 3.1145, + "theoretical_loss": 3.7591136007268484, + "tokens_seen": 738400256 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039204613841524574, + "loss": 3.1727, + "theoretical_loss": 3.759080048260972, + "tokens_seen": 738465792 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003920361083249749, + "loss": 2.9537, + "theoretical_loss": 3.7590464996062747, + "tokens_seen": 738531328 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003920260782347041, + "loss": 2.9533, + "theoretical_loss": 3.7590129547619844, + "tokens_seen": 738596864 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003920160481444333, + "loss": 2.9039, + "theoretical_loss": 3.7589794137273307, + "tokens_seen": 738662400 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039200601805416247, + "loss": 2.9335, + "theoretical_loss": 3.758945876501543, + "tokens_seen": 738727936 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003919959879638917, + "loss": 3.0055, + "theoretical_loss": 3.7589123430838516, + "tokens_seen": 738793472 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039198595787362083, + "loss": 2.9139, + "theoretical_loss": 3.758878813473485, + "tokens_seen": 738859008 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1195656, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1519150733947754, + "objective/train/theoretical_loss": 3.758862050095809, + "objective/train/tokens_used": 759351776, + "theoretical_loss": 3.758862050095809, + "tokens_seen": 738891776 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039197592778335007, + "loss": 3.1489, + "theoretical_loss": 3.7588452876696747, + "tokens_seen": 738924544 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039196589769307925, + "loss": 3.0004, + "theoretical_loss": 3.75881176567165, + "tokens_seen": 738990080 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039195586760280843, + "loss": 2.915, + "theoretical_loss": 3.758778247478642, + "tokens_seen": 739055616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003919458375125376, + "loss": 2.9579, + "theoretical_loss": 3.7587447330898813, + "tokens_seen": 739121152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003919358074222668, + "loss": 2.8448, + "theoretical_loss": 3.7587112225045987, + "tokens_seen": 739186688 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039192577733199597, + "loss": 2.9591, + "theoretical_loss": 3.758677715722025, + "tokens_seen": 739252224 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003919157472417252, + "loss": 2.8846, + "theoretical_loss": 3.758644212741392, + "tokens_seen": 739317760 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039190571715145433, + "loss": 3.1598, + "theoretical_loss": 3.758610713561932, + "tokens_seen": 739383296 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039189568706118357, + "loss": 2.9764, + "theoretical_loss": 3.758577218182876, + "tokens_seen": 739448832 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039188565697091275, + "loss": 3.041, + "theoretical_loss": 3.758543726603457, + "tokens_seen": 739514368 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039187562688064193, + "loss": 2.9797, + "theoretical_loss": 3.7585102388229057, + "tokens_seen": 739579904 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039186559679037117, + "loss": 3.1474, + "theoretical_loss": 3.7584767548404563, + "tokens_seen": 739645440 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003918555667001003, + "loss": 2.8608, + "theoretical_loss": 3.758443274655341, + "tokens_seen": 739710976 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039184553660982953, + "loss": 2.9516, + "theoretical_loss": 3.758409798266792, + "tokens_seen": 739776512 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039183550651955866, + "loss": 2.9244, + "theoretical_loss": 3.7583763256740434, + "tokens_seen": 739842048 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003918254764292879, + "loss": 2.8762, + "theoretical_loss": 3.758342856876329, + "tokens_seen": 739907584 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039181544633901707, + "loss": 2.9515, + "theoretical_loss": 3.7583093918728805, + "tokens_seen": 739973120 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039180541624874625, + "loss": 2.9793, + "theoretical_loss": 3.7582759306629345, + "tokens_seen": 740038656 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039179538615847543, + "loss": 3.015, + "theoretical_loss": 3.7582424732457227, + "tokens_seen": 740104192 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039178535606820467, + "loss": 2.9152, + "theoretical_loss": 3.758209019620481, + "tokens_seen": 740169728 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003917753259779338, + "loss": 3.0668, + "theoretical_loss": 3.7581755697864434, + "tokens_seen": 740235264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039176529588766303, + "loss": 3.0262, + "theoretical_loss": 3.758142123742844, + "tokens_seen": 740300800 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039175526579739216, + "loss": 2.9611, + "theoretical_loss": 3.758108681488919, + "tokens_seen": 740366336 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003917452357071214, + "loss": 2.9714, + "theoretical_loss": 3.7580752430239026, + "tokens_seen": 740431872 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003917352056168506, + "loss": 2.8407, + "theoretical_loss": 3.7580418083470306, + "tokens_seen": 740497408 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1198392, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1805214881896973, + "objective/train/theoretical_loss": 3.7580250924289107, + "objective/train/tokens_used": 760990176, + "theoretical_loss": 3.7580250924289107, + "tokens_seen": 740530176 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039172517552657976, + "loss": 3.0132, + "theoretical_loss": 3.7580083774575392, + "tokens_seen": 740562944 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039171514543630894, + "loss": 2.947, + "theoretical_loss": 3.7579749503546633, + "tokens_seen": 740628480 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003917051153460381, + "loss": 3.0088, + "theoretical_loss": 3.75794152703764, + "tokens_seen": 740694016 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003916950852557673, + "loss": 2.9742, + "theoretical_loss": 3.7579081075057044, + "tokens_seen": 740759552 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039168505516549653, + "loss": 2.874, + "theoretical_loss": 3.757874691758094, + "tokens_seen": 740825088 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039167502507522566, + "loss": 2.8732, + "theoretical_loss": 3.757841279794045, + "tokens_seen": 740890624 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003916649949849549, + "loss": 2.845, + "theoretical_loss": 3.757807871612795, + "tokens_seen": 740956160 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003916549648946841, + "loss": 2.9724, + "theoretical_loss": 3.75777446721358, + "tokens_seen": 741021696 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039164493480441326, + "loss": 2.9691, + "theoretical_loss": 3.7577410665956394, + "tokens_seen": 741087232 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039163490471414244, + "loss": 3.106, + "theoretical_loss": 3.757707669758209, + "tokens_seen": 741152768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039163490471414244, + "loss": 2.8908, + "theoretical_loss": 3.7576742767005276, + "tokens_seen": 741218304 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003916248746238716, + "loss": 2.9143, + "theoretical_loss": 3.7576408874218328, + "tokens_seen": 741283840 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003916148445336008, + "loss": 3.067, + "theoretical_loss": 3.7576075019213633, + "tokens_seen": 741349376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039160481444333004, + "loss": 2.9113, + "theoretical_loss": 3.757574120198357, + "tokens_seen": 741414912 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039159478435305916, + "loss": 2.9934, + "theoretical_loss": 3.7575407422520537, + "tokens_seen": 741480448 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003915847542627884, + "loss": 2.9994, + "theoretical_loss": 3.7575073680816917, + "tokens_seen": 741545984 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003915747241725175, + "loss": 3.0816, + "theoretical_loss": 3.7574739976865095, + "tokens_seen": 741611520 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039156469408224676, + "loss": 3.0493, + "theoretical_loss": 3.757440631065748, + "tokens_seen": 741677056 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039155466399197594, + "loss": 2.9799, + "theoretical_loss": 3.757407268218646, + "tokens_seen": 741742592 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003915446339017051, + "loss": 2.9979, + "theoretical_loss": 3.7573739091444436, + "tokens_seen": 741808128 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003915346038114343, + "loss": 3.0223, + "theoretical_loss": 3.7573405538423805, + "tokens_seen": 741873664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003915245737211635, + "loss": 3.086, + "theoretical_loss": 3.757307202311697, + "tokens_seen": 741939200 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039151454363089267, + "loss": 2.8982, + "theoretical_loss": 3.7572738545516344, + "tokens_seen": 742004736 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003915045135406219, + "loss": 2.9743, + "theoretical_loss": 3.7572405105614326, + "tokens_seen": 742070272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039149448345035103, + "loss": 3.0361, + "theoretical_loss": 3.757207170340333, + "tokens_seen": 742135808 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1201322, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2057783603668213, + "objective/train/theoretical_loss": 3.7571905016429588, + "objective/train/tokens_used": 762628576, + "theoretical_loss": 3.7571905016429588, + "tokens_seen": 742168576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039148445336008027, + "loss": 3.0666, + "theoretical_loss": 3.757173833887576, + "tokens_seen": 742201344 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039147442326980945, + "loss": 2.8989, + "theoretical_loss": 3.7571405012024037, + "tokens_seen": 742266880 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039146439317953863, + "loss": 2.894, + "theoretical_loss": 3.7571071722840577, + "tokens_seen": 742332416 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003914543630892678, + "loss": 3.0241, + "theoretical_loss": 3.75707384713178, + "tokens_seen": 742397952 + }, + { + "epoch": 2.02, + "learning_rate": 0.000391444332998997, + "loss": 3.061, + "theoretical_loss": 3.757040525744812, + "tokens_seen": 742463488 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039143430290872617, + "loss": 2.9768, + "theoretical_loss": 3.7570072081223964, + "tokens_seen": 742529024 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003914242728184554, + "loss": 3.0635, + "theoretical_loss": 3.7569738942637754, + "tokens_seen": 742594560 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039141424272818453, + "loss": 2.9318, + "theoretical_loss": 3.756940584168192, + "tokens_seen": 742660096 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039140421263791377, + "loss": 3.1994, + "theoretical_loss": 3.7569072778348893, + "tokens_seen": 742725632 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003913941825476429, + "loss": 2.9617, + "theoretical_loss": 3.75687397526311, + "tokens_seen": 742791168 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039138415245737213, + "loss": 2.9108, + "theoretical_loss": 3.7568406764520974, + "tokens_seen": 742856704 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003913741223671013, + "loss": 3.1298, + "theoretical_loss": 3.756807381401096, + "tokens_seen": 742922240 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003913640922768305, + "loss": 3.0188, + "theoretical_loss": 3.7567740901093485, + "tokens_seen": 742987776 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003913540621865597, + "loss": 2.854, + "theoretical_loss": 3.756740802576099, + "tokens_seen": 743053312 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039134403209628886, + "loss": 2.924, + "theoretical_loss": 3.7567075188005923, + "tokens_seen": 743118848 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039133400200601804, + "loss": 2.9303, + "theoretical_loss": 3.7566742387820726, + "tokens_seen": 743184384 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039132397191574727, + "loss": 3.0948, + "theoretical_loss": 3.756640962519785, + "tokens_seen": 743249920 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003913139418254764, + "loss": 2.8605, + "theoretical_loss": 3.7566076900129737, + "tokens_seen": 743315456 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039130391173520563, + "loss": 2.8428, + "theoretical_loss": 3.7565744212608845, + "tokens_seen": 743380992 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003912938816449348, + "loss": 2.952, + "theoretical_loss": 3.7565411562627613, + "tokens_seen": 743446528 + }, + { + "epoch": 2.02, + "learning_rate": 0.000391283851554664, + "loss": 3.1202, + "theoretical_loss": 3.756507895017852, + "tokens_seen": 743512064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003912738214643932, + "loss": 2.9858, + "theoretical_loss": 3.7564746375254003, + "tokens_seen": 743577600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039126379137412236, + "loss": 2.9337, + "theoretical_loss": 3.756441383784653, + "tokens_seen": 743643136 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039125376128385154, + "loss": 2.9503, + "theoretical_loss": 3.7564081337948565, + "tokens_seen": 743708672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003912437311935808, + "loss": 3.11, + "theoretical_loss": 3.756374887555257, + "tokens_seen": 743774208 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1204150, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1690452098846436, + "objective/train/theoretical_loss": 3.7563582658415453, + "objective/train/tokens_used": 764266976, + "theoretical_loss": 3.7563582658415453, + "tokens_seen": 743806976 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003912337011033099, + "loss": 2.9473, + "theoretical_loss": 3.756341645065101, + "tokens_seen": 743839744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039122367101303914, + "loss": 3.0255, + "theoretical_loss": 3.756308406323635, + "tokens_seen": 743905280 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039121364092276826, + "loss": 2.9326, + "theoretical_loss": 3.756275171330107, + "tokens_seen": 743970816 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003912036108324975, + "loss": 2.9648, + "theoretical_loss": 3.756241940083764, + "tokens_seen": 744036352 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003911935807422267, + "loss": 3.0479, + "theoretical_loss": 3.7562087125838532, + "tokens_seen": 744101888 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039118355065195586, + "loss": 2.9596, + "theoretical_loss": 3.756175488829622, + "tokens_seen": 744167424 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039117352056168504, + "loss": 2.7357, + "theoretical_loss": 3.7561422688203194, + "tokens_seen": 744232960 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003911634904714143, + "loss": 2.9551, + "theoretical_loss": 3.7561090525551926, + "tokens_seen": 744298496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003911534603811434, + "loss": 3.0986, + "theoretical_loss": 3.756075840033491, + "tokens_seen": 744364032 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039114343029087264, + "loss": 2.9663, + "theoretical_loss": 3.756042631254462, + "tokens_seen": 744429568 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003911334002006018, + "loss": 3.1455, + "theoretical_loss": 3.7560094262173553, + "tokens_seen": 744495104 + }, + { + "epoch": 2.02, + "learning_rate": 0.000391123370110331, + "loss": 3.0022, + "theoretical_loss": 3.7559762249214192, + "tokens_seen": 744560640 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039111334002006024, + "loss": 2.9312, + "theoretical_loss": 3.7559430273659036, + "tokens_seen": 744626176 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039110330992978936, + "loss": 2.9031, + "theoretical_loss": 3.7559098335500574, + "tokens_seen": 744691712 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003910932798395186, + "loss": 3.059, + "theoretical_loss": 3.7558766434731305, + "tokens_seen": 744757248 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039108324974924773, + "loss": 2.8813, + "theoretical_loss": 3.7558434571343735, + "tokens_seen": 744822784 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039107321965897696, + "loss": 2.9588, + "theoretical_loss": 3.7558102745330357, + "tokens_seen": 744888320 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039106318956870614, + "loss": 2.9811, + "theoretical_loss": 3.7557770956683676, + "tokens_seen": 744953856 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003910531594784353, + "loss": 2.9526, + "theoretical_loss": 3.7557439205396195, + "tokens_seen": 745019392 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003910431293881645, + "loss": 2.9516, + "theoretical_loss": 3.7557107491460426, + "tokens_seen": 745084928 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003910330992978937, + "loss": 2.9033, + "theoretical_loss": 3.7556775814868875, + "tokens_seen": 745150464 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039102306920762287, + "loss": 2.9663, + "theoretical_loss": 3.7556444175614057, + "tokens_seen": 745216000 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003910130391173521, + "loss": 3.0265, + "theoretical_loss": 3.755611257368849, + "tokens_seen": 745281536 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039100300902708123, + "loss": 2.9884, + "theoretical_loss": 3.7555781009084677, + "tokens_seen": 745347072 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039099297893681047, + "loss": 3.0202, + "theoretical_loss": 3.755544948179515, + "tokens_seen": 745412608 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1207077, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.044055461883545, + "objective/train/theoretical_loss": 3.7555283732140903, + "objective/train/tokens_used": 765905376, + "theoretical_loss": 3.7555283732140903, + "tokens_seen": 745445376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039098294884653965, + "loss": 3.0025, + "theoretical_loss": 3.7555117991812423, + "tokens_seen": 745478144 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039097291875626883, + "loss": 2.9158, + "theoretical_loss": 3.7554786539129017, + "tokens_seen": 745543680 + }, + { + "epoch": 2.02, + "learning_rate": 0.000390962888665998, + "loss": 2.9175, + "theoretical_loss": 3.755445512373746, + "tokens_seen": 745609216 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003909528585757272, + "loss": 2.9913, + "theoretical_loss": 3.7554123745630283, + "tokens_seen": 745674752 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039094282848545637, + "loss": 3.0464, + "theoretical_loss": 3.755379240480001, + "tokens_seen": 745740288 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003909327983951856, + "loss": 2.9629, + "theoretical_loss": 3.7553461101239174, + "tokens_seen": 745805824 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039092276830491473, + "loss": 2.9493, + "theoretical_loss": 3.75531298349403, + "tokens_seen": 745871360 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039091273821464397, + "loss": 2.9984, + "theoretical_loss": 3.755279860589594, + "tokens_seen": 745936896 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003909027081243731, + "loss": 3.0817, + "theoretical_loss": 3.755246741409862, + "tokens_seen": 746002432 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039089267803410233, + "loss": 2.9857, + "theoretical_loss": 3.755213625954089, + "tokens_seen": 746067968 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003908826479438315, + "loss": 3.1477, + "theoretical_loss": 3.7551805142215278, + "tokens_seen": 746133504 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003908726178535607, + "loss": 2.9285, + "theoretical_loss": 3.7551474062114334, + "tokens_seen": 746199040 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003908625877632899, + "loss": 2.9475, + "theoretical_loss": 3.7551143019230615, + "tokens_seen": 746264576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039085255767301906, + "loss": 2.925, + "theoretical_loss": 3.755081201355665, + "tokens_seen": 746330112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039084252758274824, + "loss": 3.0294, + "theoretical_loss": 3.755048104508501, + "tokens_seen": 746395648 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039083249749247747, + "loss": 2.9979, + "theoretical_loss": 3.755015011380823, + "tokens_seen": 746461184 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003908224674022066, + "loss": 3.0141, + "theoretical_loss": 3.754981921971888, + "tokens_seen": 746526720 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039081243731193583, + "loss": 3.097, + "theoretical_loss": 3.7549488362809504, + "tokens_seen": 746592256 + }, + { + "epoch": 2.02, + "learning_rate": 0.000390802407221665, + "loss": 2.8439, + "theoretical_loss": 3.754915754307267, + "tokens_seen": 746657792 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003907923771313942, + "loss": 2.9692, + "theoretical_loss": 3.7548826760500935, + "tokens_seen": 746723328 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003907823470411234, + "loss": 3.1321, + "theoretical_loss": 3.7548496015086874, + "tokens_seen": 746788864 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039077231695085256, + "loss": 2.9378, + "theoretical_loss": 3.7548165306823034, + "tokens_seen": 746854400 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039076228686058174, + "loss": 2.9321, + "theoretical_loss": 3.754783463570199, + "tokens_seen": 746919936 + }, + { + "epoch": 2.02, + "learning_rate": 0.000390752256770311, + "loss": 3.0132, + "theoretical_loss": 3.7547504001716314, + "tokens_seen": 746985472 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003907422266800401, + "loss": 2.9307, + "theoretical_loss": 3.7547173404858585, + "tokens_seen": 747051008 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1209842, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9305930137634277, + "objective/train/theoretical_loss": 3.7547008120350376, + "objective/train/tokens_used": 767543776, + "theoretical_loss": 3.7547008120350376, + "tokens_seen": 747083776 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039073219658976934, + "loss": 2.888, + "theoretical_loss": 3.7546842845121366, + "tokens_seen": 747116544 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039072216649949846, + "loss": 3.0373, + "theoretical_loss": 3.754651232249724, + "tokens_seen": 747182080 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003907121364092277, + "loss": 2.946, + "theoretical_loss": 3.754618183697878, + "tokens_seen": 747247616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003907021063189569, + "loss": 2.898, + "theoretical_loss": 3.754585138855857, + "tokens_seen": 747313152 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039069207622868606, + "loss": 3.0142, + "theoretical_loss": 3.754552097722919, + "tokens_seen": 747378688 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039068204613841524, + "loss": 3.0458, + "theoretical_loss": 3.7545190602983234, + "tokens_seen": 747444224 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003906720160481445, + "loss": 2.9139, + "theoretical_loss": 3.754486026581328, + "tokens_seen": 747509760 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003906619859578736, + "loss": 2.9537, + "theoretical_loss": 3.7544529965711915, + "tokens_seen": 747575296 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039065195586760284, + "loss": 2.9049, + "theoretical_loss": 3.754419970267174, + "tokens_seen": 747640832 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039064192577733197, + "loss": 2.8327, + "theoretical_loss": 3.754386947668534, + "tokens_seen": 747706368 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003906318956870612, + "loss": 3.0832, + "theoretical_loss": 3.754353928774532, + "tokens_seen": 747771904 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003906218655967904, + "loss": 2.9925, + "theoretical_loss": 3.7543209135844267, + "tokens_seen": 747837440 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039061183550651957, + "loss": 3.0345, + "theoretical_loss": 3.754287902097478, + "tokens_seen": 747902976 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039060180541624875, + "loss": 2.9851, + "theoretical_loss": 3.7542548943129477, + "tokens_seen": 747968512 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039059177532597793, + "loss": 2.8701, + "theoretical_loss": 3.7542218902300943, + "tokens_seen": 748034048 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003905817452357071, + "loss": 2.8562, + "theoretical_loss": 3.7541888898481797, + "tokens_seen": 748099584 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039057171514543634, + "loss": 2.9242, + "theoretical_loss": 3.7541558931664643, + "tokens_seen": 748165120 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039056168505516547, + "loss": 2.8132, + "theoretical_loss": 3.7541229001842096, + "tokens_seen": 748230656 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003905516549648947, + "loss": 2.8961, + "theoretical_loss": 3.7540899109006753, + "tokens_seen": 748296192 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039054162487462383, + "loss": 2.9666, + "theoretical_loss": 3.7540569253151244, + "tokens_seen": 748361728 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039053159478435307, + "loss": 2.964, + "theoretical_loss": 3.7540239434268186, + "tokens_seen": 748427264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039052156469408225, + "loss": 2.8562, + "theoretical_loss": 3.753990965235019, + "tokens_seen": 748492800 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039051153460381143, + "loss": 2.8804, + "theoretical_loss": 3.7539579907389884, + "tokens_seen": 748558336 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003905015045135406, + "loss": 2.7883, + "theoretical_loss": 3.7539250199379888, + "tokens_seen": 748623872 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039049147442326985, + "loss": 3.0785, + "theoretical_loss": 3.753892052831282, + "tokens_seen": 748689408 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1211281, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6608660221099854, + "objective/train/theoretical_loss": 3.753875570663058, + "objective/train/tokens_used": 769182176, + "theoretical_loss": 3.753875570663058, + "tokens_seen": 748722176 + }, + { + "epoch": 2.02, + "learning_rate": 0.000390481444332999, + "loss": 2.8648, + "theoretical_loss": 3.753859089418132, + "tokens_seen": 748754944 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003904714142427282, + "loss": 3.0608, + "theoretical_loss": 3.753826129697801, + "tokens_seen": 748820480 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039046138415245734, + "loss": 2.9658, + "theoretical_loss": 3.753793173669552, + "tokens_seen": 748886016 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039045135406218657, + "loss": 2.8583, + "theoretical_loss": 3.7537602213326493, + "tokens_seen": 748951552 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039044132397191575, + "loss": 2.8975, + "theoretical_loss": 3.7537272726863558, + "tokens_seen": 749017088 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039043129388164493, + "loss": 2.8692, + "theoretical_loss": 3.753694327729935, + "tokens_seen": 749082624 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003904212637913741, + "loss": 2.9435, + "theoretical_loss": 3.753661386462652, + "tokens_seen": 749148160 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003904112337011033, + "loss": 2.9416, + "theoretical_loss": 3.75362844888377, + "tokens_seen": 749213696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003904012036108325, + "loss": 2.9706, + "theoretical_loss": 3.7535955149925537, + "tokens_seen": 749279232 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003903911735205617, + "loss": 2.9234, + "theoretical_loss": 3.753562584788268, + "tokens_seen": 749344768 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003903811434302909, + "loss": 2.9965, + "theoretical_loss": 3.7535296582701774, + "tokens_seen": 749410304 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003903711133400201, + "loss": 2.9687, + "theoretical_loss": 3.7534967354375475, + "tokens_seen": 749475840 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039036108324974926, + "loss": 3.0181, + "theoretical_loss": 3.7534638162896425, + "tokens_seen": 749541376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039035105315947844, + "loss": 2.8533, + "theoretical_loss": 3.7534309008257294, + "tokens_seen": 749606912 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039034102306920767, + "loss": 3.0298, + "theoretical_loss": 3.7533979890450726, + "tokens_seen": 749672448 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003903309929789368, + "loss": 2.9555, + "theoretical_loss": 3.7533650809469385, + "tokens_seen": 749737984 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039032096288866603, + "loss": 3.1138, + "theoretical_loss": 3.7533321765305936, + "tokens_seen": 749803520 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003903109327983952, + "loss": 3.1032, + "theoretical_loss": 3.753299275795303, + "tokens_seen": 749869056 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003903009027081244, + "loss": 2.9954, + "theoretical_loss": 3.753266378740335, + "tokens_seen": 749934592 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003902908726178536, + "loss": 2.9935, + "theoretical_loss": 3.753233485364955, + "tokens_seen": 750000128 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039028084252758276, + "loss": 2.9978, + "theoretical_loss": 3.753200595668431, + "tokens_seen": 750065664 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039027081243731194, + "loss": 3.155, + "theoretical_loss": 3.753167709650029, + "tokens_seen": 750131200 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003902607823470412, + "loss": 3.0258, + "theoretical_loss": 3.753134827309017, + "tokens_seen": 750196736 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003902507522567703, + "loss": 2.9158, + "theoretical_loss": 3.7531019486446624, + "tokens_seen": 750262272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039024072216649954, + "loss": 2.9328, + "theoretical_loss": 3.753069073656234, + "tokens_seen": 750327808 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1215179, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8630805015563965, + "objective/train/theoretical_loss": 3.7530526375402626, + "objective/train/tokens_used": 770820576, + "theoretical_loss": 3.7530526375402626, + "tokens_seen": 750360576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039023069207622866, + "loss": 2.9345, + "theoretical_loss": 3.753036202342998, + "tokens_seen": 750393344 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003902206619859579, + "loss": 2.8847, + "theoretical_loss": 3.753003334704224, + "tokens_seen": 750458880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003902106318956871, + "loss": 3.0384, + "theoretical_loss": 3.7529704707391796, + "tokens_seen": 750524416 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039020060180541626, + "loss": 2.9585, + "theoretical_loss": 3.752937610447134, + "tokens_seen": 750589952 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039019057171514544, + "loss": 2.8799, + "theoretical_loss": 3.752904753827356, + "tokens_seen": 750655488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003901805416248747, + "loss": 2.9359, + "theoretical_loss": 3.7528719008791143, + "tokens_seen": 750721024 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003901705115346038, + "loss": 2.9109, + "theoretical_loss": 3.7528390516016783, + "tokens_seen": 750786560 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039016048144433304, + "loss": 2.9376, + "theoretical_loss": 3.752806205994318, + "tokens_seen": 750852096 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039015045135406217, + "loss": 2.9303, + "theoretical_loss": 3.752773364056303, + "tokens_seen": 750917632 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003901404212637914, + "loss": 2.9267, + "theoretical_loss": 3.752740525786902, + "tokens_seen": 750983168 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003901303911735206, + "loss": 3.1094, + "theoretical_loss": 3.7527076911853863, + "tokens_seen": 751048704 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039012036108324977, + "loss": 2.8119, + "theoretical_loss": 3.7526748602510254, + "tokens_seen": 751114240 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039011033099297895, + "loss": 3.0966, + "theoretical_loss": 3.7526420329830907, + "tokens_seen": 751179776 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039010030090270813, + "loss": 3.0377, + "theoretical_loss": 3.7526092093808527, + "tokens_seen": 751245312 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003900902708124373, + "loss": 3.1009, + "theoretical_loss": 3.7525763894435817, + "tokens_seen": 751310848 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039008024072216654, + "loss": 3.0425, + "theoretical_loss": 3.7525435731705494, + "tokens_seen": 751376384 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039007021063189567, + "loss": 3.0359, + "theoretical_loss": 3.752510760561027, + "tokens_seen": 751441920 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003900601805416249, + "loss": 2.9037, + "theoretical_loss": 3.7524779516142868, + "tokens_seen": 751507456 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039005015045135403, + "loss": 3.0182, + "theoretical_loss": 3.752445146329599, + "tokens_seen": 751572992 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039004012036108327, + "loss": 2.9458, + "theoretical_loss": 3.7524123447062365, + "tokens_seen": 751638528 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039003009027081245, + "loss": 2.8281, + "theoretical_loss": 3.7523795467434717, + "tokens_seen": 751704064 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039002006018054163, + "loss": 3.1118, + "theoretical_loss": 3.7523467524405767, + "tokens_seen": 751769600 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003900100300902708, + "loss": 3.0284, + "theoretical_loss": 3.7523139617968244, + "tokens_seen": 751835136 + }, + { + "epoch": 2.02, + "learning_rate": 0.00039000000000000005, + "loss": 3.0957, + "theoretical_loss": 3.752281174811487, + "tokens_seen": 751900672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003899899699097292, + "loss": 3.2553, + "theoretical_loss": 3.752248391483838, + "tokens_seen": 751966208 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1216494, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9882922172546387, + "objective/train/theoretical_loss": 3.7522320011914196, + "objective/train/tokens_used": 772458976, + "theoretical_loss": 3.7522320011914196, + "tokens_seen": 751998976 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003899799398194584, + "loss": 2.8418, + "theoretical_loss": 3.7522156118131504, + "tokens_seen": 752031744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038996990972918754, + "loss": 2.961, + "theoretical_loss": 3.7521828357986977, + "tokens_seen": 752097280 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038995987963891677, + "loss": 3.0949, + "theoretical_loss": 3.7521500634397533, + "tokens_seen": 752162816 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038994984954864595, + "loss": 2.9724, + "theoretical_loss": 3.752117294735592, + "tokens_seen": 752228352 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038993981945837513, + "loss": 3.0842, + "theoretical_loss": 3.7520845296854866, + "tokens_seen": 752293888 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003899297893681043, + "loss": 3.0908, + "theoretical_loss": 3.7520517682887125, + "tokens_seen": 752359424 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003899197592778335, + "loss": 3.0174, + "theoretical_loss": 3.752019010544543, + "tokens_seen": 752424960 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003899097291875627, + "loss": 3.1172, + "theoretical_loss": 3.7519862564522537, + "tokens_seen": 752490496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003898996990972919, + "loss": 3.058, + "theoretical_loss": 3.751953506011119, + "tokens_seen": 752556032 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038988966900702104, + "loss": 2.931, + "theoretical_loss": 3.7519207592204147, + "tokens_seen": 752621568 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003898796389167503, + "loss": 3.0689, + "theoretical_loss": 3.751888016079415, + "tokens_seen": 752687104 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003898696088264794, + "loss": 3.0791, + "theoretical_loss": 3.751855276587396, + "tokens_seen": 752752640 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038985957873620864, + "loss": 2.8788, + "theoretical_loss": 3.7518225407436336, + "tokens_seen": 752818176 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003898495486459378, + "loss": 2.9119, + "theoretical_loss": 3.7517898085474037, + "tokens_seen": 752883712 + }, + { + "epoch": 2.02, + "learning_rate": 0.000389839518555667, + "loss": 3.0455, + "theoretical_loss": 3.751757079997982, + "tokens_seen": 752949248 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003898294884653962, + "loss": 2.9127, + "theoretical_loss": 3.7517243550946446, + "tokens_seen": 753014784 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003898194583751254, + "loss": 2.9616, + "theoretical_loss": 3.7516916338366695, + "tokens_seen": 753080320 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038980942828485454, + "loss": 3.0041, + "theoretical_loss": 3.751658916223332, + "tokens_seen": 753145856 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003897993981945838, + "loss": 2.9898, + "theoretical_loss": 3.7516262022539095, + "tokens_seen": 753211392 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003897893681043129, + "loss": 2.9859, + "theoretical_loss": 3.7515934919276788, + "tokens_seen": 753276928 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038977933801404214, + "loss": 3.017, + "theoretical_loss": 3.751560785243918, + "tokens_seen": 753342464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003897693079237713, + "loss": 2.84, + "theoretical_loss": 3.7515280822019044, + "tokens_seen": 753408000 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003897592778335005, + "loss": 2.8758, + "theoretical_loss": 3.7514953828009157, + "tokens_seen": 753473536 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003897492477432297, + "loss": 2.9576, + "theoretical_loss": 3.7514626870402297, + "tokens_seen": 753539072 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038973921765295886, + "loss": 2.8275, + "theoretical_loss": 3.7514299949191248, + "tokens_seen": 753604608 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1219211, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8871450424194336, + "objective/train/theoretical_loss": 3.7514136502231894, + "objective/train/tokens_used": 774097376, + "theoretical_loss": 3.7514136502231894, + "tokens_seen": 753637376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038972918756268805, + "loss": 2.8685, + "theoretical_loss": 3.7513973064368793, + "tokens_seen": 753670144 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003897191574724173, + "loss": 2.9722, + "theoretical_loss": 3.751364621592772, + "tokens_seen": 753735680 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003897091273821464, + "loss": 2.8381, + "theoretical_loss": 3.7513319403860814, + "tokens_seen": 753801216 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038969909729187564, + "loss": 2.9302, + "theoretical_loss": 3.7512992628160875, + "tokens_seen": 753866752 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038968906720160477, + "loss": 2.9519, + "theoretical_loss": 3.7512665888820678, + "tokens_seen": 753932288 + }, + { + "epoch": 2.02, + "learning_rate": 0.000389679037111334, + "loss": 3.0663, + "theoretical_loss": 3.751233918583303, + "tokens_seen": 753997824 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003896690070210632, + "loss": 2.9979, + "theoretical_loss": 3.7512012519190723, + "tokens_seen": 754063360 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038965897693079237, + "loss": 2.9886, + "theoretical_loss": 3.7511685888886555, + "tokens_seen": 754128896 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038964894684052155, + "loss": 2.8433, + "theoretical_loss": 3.751135929491333, + "tokens_seen": 754194432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003896389167502508, + "loss": 3.0536, + "theoretical_loss": 3.7511032737263843, + "tokens_seen": 754259968 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038962888665997997, + "loss": 2.9872, + "theoretical_loss": 3.7510706215930907, + "tokens_seen": 754325504 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038961885656970915, + "loss": 2.9628, + "theoretical_loss": 3.7510379730907326, + "tokens_seen": 754391040 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038960882647943833, + "loss": 3.1345, + "theoretical_loss": 3.7510053282185907, + "tokens_seen": 754456576 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003895987963891675, + "loss": 2.9855, + "theoretical_loss": 3.7509726869759463, + "tokens_seen": 754522112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038958876629889674, + "loss": 3.0294, + "theoretical_loss": 3.75094004936208, + "tokens_seen": 754587648 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038957873620862587, + "loss": 3.0072, + "theoretical_loss": 3.750907415376274, + "tokens_seen": 754653184 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003895687061183551, + "loss": 3.0879, + "theoretical_loss": 3.75087478501781, + "tokens_seen": 754718720 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038955867602808423, + "loss": 2.9063, + "theoretical_loss": 3.7508421582859697, + "tokens_seen": 754784256 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038954864593781347, + "loss": 2.971, + "theoretical_loss": 3.750809535180035, + "tokens_seen": 754849792 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038953861584754265, + "loss": 3.1913, + "theoretical_loss": 3.7507769156992876, + "tokens_seen": 754915328 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038952858575727183, + "loss": 3.0891, + "theoretical_loss": 3.7507442998430114, + "tokens_seen": 754980864 + }, + { + "epoch": 2.02, + "learning_rate": 0.000389518555667001, + "loss": 2.9375, + "theoretical_loss": 3.7507116876104885, + "tokens_seen": 755046400 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038950852557673025, + "loss": 2.9555, + "theoretical_loss": 3.7506790790010016, + "tokens_seen": 755111936 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003894984954864594, + "loss": 3.08, + "theoretical_loss": 3.750646474013834, + "tokens_seen": 755177472 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003894884653961886, + "loss": 2.9337, + "theoretical_loss": 3.7506138726482687, + "tokens_seen": 755243008 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1221987, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0872912406921387, + "objective/train/theoretical_loss": 3.7505975733233634, + "objective/train/tokens_used": 775735776, + "theoretical_loss": 3.7505975733233634, + "tokens_seen": 755275776 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038947843530591774, + "loss": 3.0271, + "theoretical_loss": 3.75058127490359, + "tokens_seen": 755308544 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038946840521564697, + "loss": 2.9579, + "theoretical_loss": 3.7505486807790804, + "tokens_seen": 755374080 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038945837512537615, + "loss": 2.9158, + "theoretical_loss": 3.750516090274025, + "tokens_seen": 755439616 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038944834503510533, + "loss": 3.0076, + "theoretical_loss": 3.7504835033877075, + "tokens_seen": 755505152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003894383149448345, + "loss": 2.9431, + "theoretical_loss": 3.7504509201194116, + "tokens_seen": 755570688 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003894282848545637, + "loss": 2.919, + "theoretical_loss": 3.750418340468423, + "tokens_seen": 755636224 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003894182547642929, + "loss": 2.9881, + "theoretical_loss": 3.750385764434026, + "tokens_seen": 755701760 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003894082246740221, + "loss": 2.9912, + "theoretical_loss": 3.7503531920155053, + "tokens_seen": 755767296 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038939819458375124, + "loss": 2.7768, + "theoretical_loss": 3.7503206232121453, + "tokens_seen": 755832832 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003893881644934805, + "loss": 2.9153, + "theoretical_loss": 3.7502880580232336, + "tokens_seen": 755898368 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003893781344032096, + "loss": 2.917, + "theoretical_loss": 3.7502554964480534, + "tokens_seen": 755963904 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038936810431293884, + "loss": 2.9622, + "theoretical_loss": 3.7502229384858916, + "tokens_seen": 756029440 + }, + { + "epoch": 2.02, + "learning_rate": 0.000389358074222668, + "loss": 3.02, + "theoretical_loss": 3.750190384136034, + "tokens_seen": 756094976 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003893480441323972, + "loss": 2.9655, + "theoretical_loss": 3.750157833397767, + "tokens_seen": 756160512 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003893380140421264, + "loss": 3.0068, + "theoretical_loss": 3.750125286270377, + "tokens_seen": 756226048 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003893279839518556, + "loss": 2.8778, + "theoretical_loss": 3.750092742753149, + "tokens_seen": 756291584 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038931795386158474, + "loss": 2.8788, + "theoretical_loss": 3.750060202845372, + "tokens_seen": 756357120 + }, + { + "epoch": 2.02, + "learning_rate": 0.000389307923771314, + "loss": 3.0557, + "theoretical_loss": 3.750027666546332, + "tokens_seen": 756422656 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003892978936810431, + "loss": 2.968, + "theoretical_loss": 3.7499951338553164, + "tokens_seen": 756488192 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038928786359077234, + "loss": 3.0152, + "theoretical_loss": 3.7499626047716124, + "tokens_seen": 756553728 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003892778335005015, + "loss": 3.0391, + "theoretical_loss": 3.749930079294507, + "tokens_seen": 756619264 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003892678034102307, + "loss": 2.8793, + "theoretical_loss": 3.7498975574232887, + "tokens_seen": 756684800 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003892577733199599, + "loss": 2.8612, + "theoretical_loss": 3.7498650391572452, + "tokens_seen": 756750336 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038924774322968906, + "loss": 3.0558, + "theoretical_loss": 3.749832524495665, + "tokens_seen": 756815872 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038923771313941825, + "loss": 2.7883, + "theoretical_loss": 3.749800013437837, + "tokens_seen": 756881408 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1224816, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1574177742004395, + "objective/train/theoretical_loss": 3.749783759260107, + "objective/train/tokens_used": 777374176, + "theoretical_loss": 3.749783759260107, + "tokens_seen": 756914176 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003892276830491475, + "loss": 2.9432, + "theoretical_loss": 3.7497675059830486, + "tokens_seen": 756946944 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003892176529588766, + "loss": 2.9278, + "theoretical_loss": 3.7497350021305893, + "tokens_seen": 757012480 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038920762286860584, + "loss": 2.9697, + "theoretical_loss": 3.7497025018797476, + "tokens_seen": 757078016 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038919759277833497, + "loss": 3.0845, + "theoretical_loss": 3.749670005229813, + "tokens_seen": 757143552 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003891875626880642, + "loss": 3.0734, + "theoretical_loss": 3.749637512180075, + "tokens_seen": 757209088 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003891775325977934, + "loss": 2.8599, + "theoretical_loss": 3.749605022729823, + "tokens_seen": 757274624 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038916750250752257, + "loss": 2.9814, + "theoretical_loss": 3.749572536878347, + "tokens_seen": 757340160 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038915747241725175, + "loss": 3.0714, + "theoretical_loss": 3.749540054624937, + "tokens_seen": 757405696 + }, + { + "epoch": 2.02, + "learning_rate": 0.000389147442326981, + "loss": 3.0187, + "theoretical_loss": 3.7495075759688836, + "tokens_seen": 757471232 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003891374122367101, + "loss": 2.968, + "theoretical_loss": 3.7494751009094758, + "tokens_seen": 757536768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038912738214643935, + "loss": 2.7579, + "theoretical_loss": 3.7494426294460057, + "tokens_seen": 757602304 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003891173520561685, + "loss": 2.9152, + "theoretical_loss": 3.7494101615777637, + "tokens_seen": 757667840 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003891073219658977, + "loss": 2.8879, + "theoretical_loss": 3.7493776973040402, + "tokens_seen": 757733376 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003890972918756269, + "loss": 2.8944, + "theoretical_loss": 3.7493452366241273, + "tokens_seen": 757798912 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038908726178535607, + "loss": 2.8812, + "theoretical_loss": 3.7493127795373162, + "tokens_seen": 757864448 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038907723169508525, + "loss": 3.0475, + "theoretical_loss": 3.749280326042898, + "tokens_seen": 757929984 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038906720160481443, + "loss": 2.916, + "theoretical_loss": 3.7492478761401644, + "tokens_seen": 757995520 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003890571715145436, + "loss": 2.9745, + "theoretical_loss": 3.7492154298284084, + "tokens_seen": 758061056 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038904714142427285, + "loss": 3.0267, + "theoretical_loss": 3.7491829871069218, + "tokens_seen": 758126592 + }, + { + "epoch": 2.02, + "learning_rate": 0.000389037111334002, + "loss": 2.7915, + "theoretical_loss": 3.7491505479749967, + "tokens_seen": 758192128 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003890270812437312, + "loss": 3.0619, + "theoretical_loss": 3.7491181124319257, + "tokens_seen": 758257664 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003890170511534604, + "loss": 2.9143, + "theoretical_loss": 3.749085680477002, + "tokens_seen": 758323200 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003890070210631896, + "loss": 3.0127, + "theoretical_loss": 3.7490532521095186, + "tokens_seen": 758388736 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038899699097291876, + "loss": 3.0368, + "theoretical_loss": 3.7490208273287684, + "tokens_seen": 758454272 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038898696088264794, + "loss": 3.0488, + "theoretical_loss": 3.7489884061340453, + "tokens_seen": 758519808 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1227511, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.896862506866455, + "objective/train/theoretical_loss": 3.7489721968812226, + "objective/train/tokens_used": 779012576, + "theoretical_loss": 3.7489721968812226, + "tokens_seen": 758552576 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003889769307923771, + "loss": 2.9428, + "theoretical_loss": 3.748955988524642, + "tokens_seen": 758585344 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038896690070210635, + "loss": 3.0736, + "theoretical_loss": 3.7489235744998535, + "tokens_seen": 758650880 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003889568706118355, + "loss": 2.9964, + "theoretical_loss": 3.748891164058973, + "tokens_seen": 758716416 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003889468405215647, + "loss": 3.0223, + "theoretical_loss": 3.748858757201295, + "tokens_seen": 758781952 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038893681043129384, + "loss": 2.8971, + "theoretical_loss": 3.7488263539261144, + "tokens_seen": 758847488 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003889267803410231, + "loss": 2.8842, + "theoretical_loss": 3.748793954232725, + "tokens_seen": 758913024 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038891675025075226, + "loss": 3.0118, + "theoretical_loss": 3.7487615581204214, + "tokens_seen": 758978560 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038890672016048144, + "loss": 2.9508, + "theoretical_loss": 3.7487291655885, + "tokens_seen": 759044096 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003888966900702106, + "loss": 2.9483, + "theoretical_loss": 3.7486967766362547, + "tokens_seen": 759109632 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003888866599799398, + "loss": 2.9272, + "theoretical_loss": 3.7486643912629813, + "tokens_seen": 759175168 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038887662988966904, + "loss": 3.0122, + "theoretical_loss": 3.748632009467976, + "tokens_seen": 759240704 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003888665997993982, + "loss": 2.9965, + "theoretical_loss": 3.748599631250534, + "tokens_seen": 759306240 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003888565697091274, + "loss": 2.9567, + "theoretical_loss": 3.748567256609951, + "tokens_seen": 759371776 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003888465396188566, + "loss": 2.9609, + "theoretical_loss": 3.7485348855455243, + "tokens_seen": 759437312 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003888365095285858, + "loss": 2.9266, + "theoretical_loss": 3.74850251805655, + "tokens_seen": 759502848 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038882647943831494, + "loss": 2.6972, + "theoretical_loss": 3.748470154142323, + "tokens_seen": 759568384 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003888164493480442, + "loss": 3.0126, + "theoretical_loss": 3.748437793802143, + "tokens_seen": 759633920 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003888064192577733, + "loss": 2.878, + "theoretical_loss": 3.748405437035305, + "tokens_seen": 759699456 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038879638916750254, + "loss": 3.0736, + "theoretical_loss": 3.7483730838411065, + "tokens_seen": 759764992 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003887863590772317, + "loss": 2.939, + "theoretical_loss": 3.7483407342188455, + "tokens_seen": 759830528 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003887763289869609, + "loss": 2.8119, + "theoretical_loss": 3.7483083881678194, + "tokens_seen": 759896064 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003887662988966901, + "loss": 2.9301, + "theoretical_loss": 3.7482760456873256, + "tokens_seen": 759961600 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038875626880641927, + "loss": 3.0739, + "theoretical_loss": 3.7482437067766625, + "tokens_seen": 760027136 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038874623871614845, + "loss": 3.0768, + "theoretical_loss": 3.7482113714351284, + "tokens_seen": 760092672 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003887362086258777, + "loss": 3.0444, + "theoretical_loss": 3.7481790396620216, + "tokens_seen": 760158208 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1230181, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.136302947998047, + "objective/train/theoretical_loss": 3.748162875113409, + "objective/train/tokens_used": 780650976, + "theoretical_loss": 3.748162875113409, + "tokens_seen": 760190976 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003887261785356068, + "loss": 2.9394, + "theoretical_loss": 3.7481467114566405, + "tokens_seen": 760223744 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038871614844533604, + "loss": 2.9389, + "theoretical_loss": 3.748114386818284, + "tokens_seen": 760289280 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038870611835506517, + "loss": 2.9868, + "theoretical_loss": 3.7480820657462512, + "tokens_seen": 760354816 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003886960882647944, + "loss": 3.0064, + "theoretical_loss": 3.7480497482398407, + "tokens_seen": 760420352 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003886860581745236, + "loss": 2.9276, + "theoretical_loss": 3.7480174342983528, + "tokens_seen": 760485888 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038867602808425277, + "loss": 3.0619, + "theoretical_loss": 3.7479851239210866, + "tokens_seen": 760551424 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038866599799398195, + "loss": 2.849, + "theoretical_loss": 3.7479528171073424, + "tokens_seen": 760616960 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003886559679037112, + "loss": 2.9831, + "theoretical_loss": 3.7479205138564193, + "tokens_seen": 760682496 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003886459378134403, + "loss": 3.1336, + "theoretical_loss": 3.7478882141676184, + "tokens_seen": 760748032 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038863590772316955, + "loss": 2.9767, + "theoretical_loss": 3.7478559180402393, + "tokens_seen": 760813568 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003886258776328987, + "loss": 2.9837, + "theoretical_loss": 3.747823625473583, + "tokens_seen": 760879104 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003886158475426279, + "loss": 2.9779, + "theoretical_loss": 3.7477913364669506, + "tokens_seen": 760944640 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003886058174523571, + "loss": 3.0022, + "theoretical_loss": 3.747759051019642, + "tokens_seen": 761010176 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038859578736208627, + "loss": 2.9985, + "theoretical_loss": 3.74772676913096, + "tokens_seen": 761075712 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038858575727181545, + "loss": 3.0318, + "theoretical_loss": 3.747694490800204, + "tokens_seen": 761141248 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038857572718154463, + "loss": 2.815, + "theoretical_loss": 3.7476622160266775, + "tokens_seen": 761206784 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003885656970912738, + "loss": 3.035, + "theoretical_loss": 3.747629944809681, + "tokens_seen": 761272320 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038855566700100305, + "loss": 2.8778, + "theoretical_loss": 3.747597677148517, + "tokens_seen": 761337856 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003885456369107322, + "loss": 3.0603, + "theoretical_loss": 3.7475654130424876, + "tokens_seen": 761403392 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003885356068204614, + "loss": 3.0468, + "theoretical_loss": 3.7475331524908952, + "tokens_seen": 761468928 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003885255767301906, + "loss": 3.1154, + "theoretical_loss": 3.747500895493042, + "tokens_seen": 761534464 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003885155466399198, + "loss": 3.0014, + "theoretical_loss": 3.747468642048231, + "tokens_seen": 761600000 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038850551654964896, + "loss": 2.8877, + "theoretical_loss": 3.747436392155765, + "tokens_seen": 761665536 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038849548645937814, + "loss": 2.8706, + "theoretical_loss": 3.747404145814947, + "tokens_seen": 761731072 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003884854563691073, + "loss": 2.8492, + "theoretical_loss": 3.7473719030250816, + "tokens_seen": 761796608 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1231594, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.808631658554077, + "objective/train/theoretical_loss": 3.7473557829615376, + "objective/train/tokens_used": 782289376, + "theoretical_loss": 3.7473557829615376, + "tokens_seen": 761829376 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038847542627883655, + "loss": 2.9115, + "theoretical_loss": 3.747339663785471, + "tokens_seen": 761862144 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003884653961885657, + "loss": 2.9807, + "theoretical_loss": 3.747307428095419, + "tokens_seen": 761927680 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003884553660982949, + "loss": 3.0745, + "theoretical_loss": 3.74727519595423, + "tokens_seen": 761993216 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038844533600802404, + "loss": 3.0916, + "theoretical_loss": 3.747242967361208, + "tokens_seen": 762058752 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003884353059177533, + "loss": 2.9524, + "theoretical_loss": 3.747210742315657, + "tokens_seen": 762124288 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038842527582748246, + "loss": 3.0985, + "theoretical_loss": 3.747178520816882, + "tokens_seen": 762189824 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038841524573721164, + "loss": 2.9215, + "theoretical_loss": 3.747146302864188, + "tokens_seen": 762255360 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003884052156469408, + "loss": 3.0853, + "theoretical_loss": 3.7471140884568785, + "tokens_seen": 762320896 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038839518555667, + "loss": 3.0755, + "theoretical_loss": 3.74708187759426, + "tokens_seen": 762386432 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003883851554663992, + "loss": 2.9351, + "theoretical_loss": 3.7470496702756373, + "tokens_seen": 762451968 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003883751253761284, + "loss": 2.8692, + "theoretical_loss": 3.747017466500316, + "tokens_seen": 762517504 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038836509528585755, + "loss": 3.0365, + "theoretical_loss": 3.746985266267602, + "tokens_seen": 762583040 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003883550651955868, + "loss": 3.0341, + "theoretical_loss": 3.7469530695768007, + "tokens_seen": 762648576 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038834503510531596, + "loss": 2.7601, + "theoretical_loss": 3.746920876427219, + "tokens_seen": 762714112 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038833500501504514, + "loss": 2.9659, + "theoretical_loss": 3.7468886868181617, + "tokens_seen": 762779648 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003883249749247743, + "loss": 3.0072, + "theoretical_loss": 3.7468565007489367, + "tokens_seen": 762845184 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003883149448345035, + "loss": 2.8727, + "theoretical_loss": 3.74682431821885, + "tokens_seen": 762910720 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003883049147442327, + "loss": 2.9236, + "theoretical_loss": 3.7467921392272086, + "tokens_seen": 762976256 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003882948846539619, + "loss": 2.9577, + "theoretical_loss": 3.7467599637733198, + "tokens_seen": 763041792 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038828485456369105, + "loss": 2.8079, + "theoretical_loss": 3.7467277918564905, + "tokens_seen": 763107328 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003882748244734203, + "loss": 3.0695, + "theoretical_loss": 3.746695623476029, + "tokens_seen": 763172864 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003882647943831494, + "loss": 3.0324, + "theoretical_loss": 3.7466634586312413, + "tokens_seen": 763238400 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038825476429287865, + "loss": 2.903, + "theoretical_loss": 3.7466312973214366, + "tokens_seen": 763303936 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038824473420260783, + "loss": 3.0412, + "theoretical_loss": 3.7465991395459226, + "tokens_seen": 763369472 + }, + { + "epoch": 2.02, + "learning_rate": 0.000388234704112337, + "loss": 2.9101, + "theoretical_loss": 3.746566985304008, + "tokens_seen": 763435008 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1234269, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.944852352142334, + "objective/train/theoretical_loss": 3.7465509095079335, + "objective/train/tokens_used": 783927776, + "theoretical_loss": 3.7465509095079335, + "tokens_seen": 763467776 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003882246740220662, + "loss": 3.1267, + "theoretical_loss": 3.746534834595, + "tokens_seen": 763500544 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038821464393179537, + "loss": 3.062, + "theoretical_loss": 3.746502687418208, + "tokens_seen": 763566080 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038820461384152455, + "loss": 2.9545, + "theoretical_loss": 3.746470543772941, + "tokens_seen": 763631616 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003881945837512538, + "loss": 3.0456, + "theoretical_loss": 3.7464384036585074, + "tokens_seen": 763697152 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003881845536609829, + "loss": 2.9763, + "theoretical_loss": 3.7464062670742173, + "tokens_seen": 763762688 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038817452357071215, + "loss": 3.0143, + "theoretical_loss": 3.746374134019379, + "tokens_seen": 763828224 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038816449348044133, + "loss": 3.0956, + "theoretical_loss": 3.746342004493303, + "tokens_seen": 763893760 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003881544633901705, + "loss": 2.9893, + "theoretical_loss": 3.746309878495299, + "tokens_seen": 763959296 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003881444332998997, + "loss": 2.921, + "theoretical_loss": 3.7462777560246767, + "tokens_seen": 764024832 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003881344032096289, + "loss": 2.8542, + "theoretical_loss": 3.7462456370807464, + "tokens_seen": 764090368 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003881243731193581, + "loss": 3.0407, + "theoretical_loss": 3.746213521662818, + "tokens_seen": 764155904 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003881143430290873, + "loss": 3.0279, + "theoretical_loss": 3.7461814097702026, + "tokens_seen": 764221440 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038810431293881647, + "loss": 2.8578, + "theoretical_loss": 3.7461493014022116, + "tokens_seen": 764286976 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038809428284854565, + "loss": 2.9673, + "theoretical_loss": 3.7461171965581546, + "tokens_seen": 764352512 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038808425275827483, + "loss": 2.9195, + "theoretical_loss": 3.7460850952373432, + "tokens_seen": 764418048 + }, + { + "epoch": 2.02, + "learning_rate": 0.000388074222668004, + "loss": 2.794, + "theoretical_loss": 3.7460529974390893, + "tokens_seen": 764483584 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038806419257773325, + "loss": 2.9653, + "theoretical_loss": 3.7460209031627034, + "tokens_seen": 764549120 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003880541624874624, + "loss": 3.0058, + "theoretical_loss": 3.7459888124074983, + "tokens_seen": 764614656 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003880441323971916, + "loss": 2.8937, + "theoretical_loss": 3.7459567251727854, + "tokens_seen": 764680192 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003880341023069208, + "loss": 3.1167, + "theoretical_loss": 3.7459246414578775, + "tokens_seen": 764745728 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038802407221665, + "loss": 3.0756, + "theoretical_loss": 3.7458925612620853, + "tokens_seen": 764811264 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038801404212637916, + "loss": 2.8738, + "theoretical_loss": 3.745860484584723, + "tokens_seen": 764876800 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038800401203610834, + "loss": 2.9871, + "theoretical_loss": 3.7458284114251024, + "tokens_seen": 764942336 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003879939819458375, + "loss": 3.1953, + "theoretical_loss": 3.745796341782537, + "tokens_seen": 765007872 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038798395185556675, + "loss": 2.9329, + "theoretical_loss": 3.745764275656339, + "tokens_seen": 765073408 + }, + { + "epoch": 2.02, + "objective/train/docs_used": 1236952, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7477328777313232, + "objective/train/theoretical_loss": 3.745748243911663, + "objective/train/tokens_used": 785566176, + "theoretical_loss": 3.745748243911663, + "tokens_seen": 765106176 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003879739217652959, + "loss": 2.8671, + "theoretical_loss": 3.745732213045822, + "tokens_seen": 765138944 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003879638916750251, + "loss": 3.1259, + "theoretical_loss": 3.7457001539503, + "tokens_seen": 765204480 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038795386158475424, + "loss": 2.9883, + "theoretical_loss": 3.7456680983690864, + "tokens_seen": 765270016 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003879438314944835, + "loss": 2.8561, + "theoretical_loss": 3.7456360463014944, + "tokens_seen": 765335552 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038793380140421266, + "loss": 2.8567, + "theoretical_loss": 3.7456039977468394, + "tokens_seen": 765401088 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038792377131394184, + "loss": 2.9809, + "theoretical_loss": 3.7455719527044344, + "tokens_seen": 765466624 + }, + { + "epoch": 2.02, + "learning_rate": 0.000387913741223671, + "loss": 3.0177, + "theoretical_loss": 3.7455399111735943, + "tokens_seen": 765532160 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003879037111334002, + "loss": 3.0091, + "theoretical_loss": 3.7455078731536338, + "tokens_seen": 765597696 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003878936810431294, + "loss": 2.9807, + "theoretical_loss": 3.745475838643867, + "tokens_seen": 765663232 + }, + { + "epoch": 2.02, + "learning_rate": 0.0003878836509528586, + "loss": 2.9106, + "theoretical_loss": 3.7454438076436105, + "tokens_seen": 765728768 + }, + { + "epoch": 2.02, + "learning_rate": 0.00038787362086258775, + "loss": 2.8483, + "theoretical_loss": 3.7454117801521782, + "tokens_seen": 765794304 + }, + { + "epoch": 2.02, + "learning_rate": 0.000387863590772317, + "loss": 2.8418, + "theoretical_loss": 3.7453797561688855, + "tokens_seen": 765859840 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038785356068204616, + "loss": 3.1036, + "theoretical_loss": 3.7453477356930485, + "tokens_seen": 765925376 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038784353059177534, + "loss": 3.0554, + "theoretical_loss": 3.7453157187239827, + "tokens_seen": 765990912 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003878335005015045, + "loss": 2.9827, + "theoretical_loss": 3.745283705261004, + "tokens_seen": 766056448 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003878234704112337, + "loss": 2.973, + "theoretical_loss": 3.745251695303429, + "tokens_seen": 766121984 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003878134403209629, + "loss": 3.0363, + "theoretical_loss": 3.7452196888505735, + "tokens_seen": 766187520 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003878034102306921, + "loss": 3.0158, + "theoretical_loss": 3.745187685901755, + "tokens_seen": 766253056 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038779338014042125, + "loss": 3.0016, + "theoretical_loss": 3.745155686456289, + "tokens_seen": 766318592 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003877833500501505, + "loss": 3.0028, + "theoretical_loss": 3.745123690513493, + "tokens_seen": 766384128 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003877733199598796, + "loss": 2.9449, + "theoretical_loss": 3.7450916980726845, + "tokens_seen": 766449664 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038776328986960885, + "loss": 2.8778, + "theoretical_loss": 3.74505970913318, + "tokens_seen": 766515200 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038775325977933803, + "loss": 3.1647, + "theoretical_loss": 3.7450277236942977, + "tokens_seen": 766580736 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003877432296890672, + "loss": 2.8239, + "theoretical_loss": 3.7449957417553548, + "tokens_seen": 766646272 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003877331995987964, + "loss": 2.8203, + "theoretical_loss": 3.744963763315669, + "tokens_seen": 766711808 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1239797, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0547590255737305, + "objective/train/theoretical_loss": 3.744947775407835, + "objective/train/tokens_used": 787204576, + "theoretical_loss": 3.744947775407835, + "tokens_seen": 766744576 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038772316950852557, + "loss": 3.139, + "theoretical_loss": 3.7449317883745596, + "tokens_seen": 766777344 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038771313941825475, + "loss": 2.9648, + "theoretical_loss": 3.7448998169313437, + "tokens_seen": 766842880 + }, + { + "epoch": 2.03, + "learning_rate": 0.000387703109327984, + "loss": 3.0615, + "theoretical_loss": 3.74486784898534, + "tokens_seen": 766908416 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003876930792377131, + "loss": 2.8529, + "theoretical_loss": 3.744835884535868, + "tokens_seen": 766973952 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038768304914744235, + "loss": 3.0979, + "theoretical_loss": 3.7448039235822446, + "tokens_seen": 767039488 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038767301905717153, + "loss": 3.166, + "theoretical_loss": 3.744771966123791, + "tokens_seen": 767105024 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003876629889669007, + "loss": 3.105, + "theoretical_loss": 3.744740012159825, + "tokens_seen": 767170560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003876529588766299, + "loss": 2.9752, + "theoretical_loss": 3.7447080616896664, + "tokens_seen": 767236096 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003876429287863591, + "loss": 2.8694, + "theoretical_loss": 3.744676114712635, + "tokens_seen": 767301632 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038763289869608826, + "loss": 2.9146, + "theoretical_loss": 3.7446441712280505, + "tokens_seen": 767367168 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003876228686058175, + "loss": 3.01, + "theoretical_loss": 3.744612231235233, + "tokens_seen": 767432704 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003876128385155466, + "loss": 3.09, + "theoretical_loss": 3.7445802947335025, + "tokens_seen": 767498240 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038760280842527585, + "loss": 2.9301, + "theoretical_loss": 3.7445483617221793, + "tokens_seen": 767563776 + }, + { + "epoch": 2.03, + "learning_rate": 0.000387592778335005, + "loss": 2.8034, + "theoretical_loss": 3.744516432200584, + "tokens_seen": 767629312 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003875827482447342, + "loss": 3.081, + "theoretical_loss": 3.7444845061680376, + "tokens_seen": 767694848 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003875727181544634, + "loss": 2.9936, + "theoretical_loss": 3.7444525836238607, + "tokens_seen": 767760384 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003875626880641926, + "loss": 2.853, + "theoretical_loss": 3.7444206645673748, + "tokens_seen": 767825920 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038755265797392176, + "loss": 3.0225, + "theoretical_loss": 3.744388748997901, + "tokens_seen": 767891456 + }, + { + "epoch": 2.03, + "learning_rate": 0.000387542627883651, + "loss": 2.9309, + "theoretical_loss": 3.7443568369147604, + "tokens_seen": 767956992 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003875325977933801, + "loss": 2.9964, + "theoretical_loss": 3.7443249283172753, + "tokens_seen": 768022528 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038752256770310936, + "loss": 2.9178, + "theoretical_loss": 3.7442930232047678, + "tokens_seen": 768088064 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003875125376128385, + "loss": 2.9252, + "theoretical_loss": 3.7442611215765593, + "tokens_seen": 768153600 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003875025075225677, + "loss": 3.0851, + "theoretical_loss": 3.7442292234319723, + "tokens_seen": 768219136 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003874924774322969, + "loss": 3.0973, + "theoretical_loss": 3.744197328770329, + "tokens_seen": 768284672 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003874824473420261, + "loss": 2.9802, + "theoretical_loss": 3.7441654375909525, + "tokens_seen": 768350208 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1242843, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.195178508758545, + "objective/train/theoretical_loss": 3.744149493306903, + "objective/train/tokens_used": 788842976, + "theoretical_loss": 3.744149493306903, + "tokens_seen": 768382976 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038747241725175526, + "loss": 3.0888, + "theoretical_loss": 3.744133549893166, + "tokens_seen": 768415744 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038746238716148444, + "loss": 2.8997, + "theoretical_loss": 3.7441016656762915, + "tokens_seen": 768481280 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003874523570712136, + "loss": 2.9898, + "theoretical_loss": 3.7440697849396525, + "tokens_seen": 768546816 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038744232698094286, + "loss": 2.9246, + "theoretical_loss": 3.744037907682573, + "tokens_seen": 768612352 + }, + { + "epoch": 2.03, + "learning_rate": 0.000387432296890672, + "loss": 2.9615, + "theoretical_loss": 3.7440060339043764, + "tokens_seen": 768677888 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003874222668004012, + "loss": 2.9298, + "theoretical_loss": 3.7439741636043857, + "tokens_seen": 768743424 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038741223671013035, + "loss": 3.0435, + "theoretical_loss": 3.743942296781926, + "tokens_seen": 768808960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003874022066198596, + "loss": 2.9706, + "theoretical_loss": 3.7439104334363207, + "tokens_seen": 768874496 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038739217652958876, + "loss": 2.7536, + "theoretical_loss": 3.743878573566894, + "tokens_seen": 768940032 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038738214643931795, + "loss": 2.8831, + "theoretical_loss": 3.7438467171729712, + "tokens_seen": 769005568 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003873721163490472, + "loss": 2.8761, + "theoretical_loss": 3.743814864253877, + "tokens_seen": 769071104 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038736208625877636, + "loss": 2.8513, + "theoretical_loss": 3.743783014808936, + "tokens_seen": 769136640 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038735205616850554, + "loss": 2.7194, + "theoretical_loss": 3.7437511688374725, + "tokens_seen": 769202176 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003873420260782347, + "loss": 3.1493, + "theoretical_loss": 3.743719326338813, + "tokens_seen": 769267712 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003873319959879639, + "loss": 2.9219, + "theoretical_loss": 3.7436874873122825, + "tokens_seen": 769333248 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003873219658976931, + "loss": 2.9724, + "theoretical_loss": 3.743655651757207, + "tokens_seen": 769398784 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003873119358074223, + "loss": 2.9894, + "theoretical_loss": 3.7436238196729117, + "tokens_seen": 769464320 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038730190571715145, + "loss": 2.8999, + "theoretical_loss": 3.743591991058723, + "tokens_seen": 769529856 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003872918756268807, + "loss": 3.003, + "theoretical_loss": 3.7435601659139675, + "tokens_seen": 769595392 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003872818455366098, + "loss": 2.8466, + "theoretical_loss": 3.743528344237971, + "tokens_seen": 769660928 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038727181544633905, + "loss": 2.8713, + "theoretical_loss": 3.74349652603006, + "tokens_seen": 769726464 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038726178535606823, + "loss": 2.9975, + "theoretical_loss": 3.743464711289562, + "tokens_seen": 769792000 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003872517552657974, + "loss": 3.0192, + "theoretical_loss": 3.7434329000158035, + "tokens_seen": 769857536 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003872417251755266, + "loss": 3.0127, + "theoretical_loss": 3.7434010922081127, + "tokens_seen": 769923072 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038723169508525577, + "loss": 2.8993, + "theoretical_loss": 3.743369287865815, + "tokens_seen": 769988608 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1244287, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1046810150146484, + "objective/train/theoretical_loss": 3.743353386993979, + "objective/train/tokens_used": 790481376, + "theoretical_loss": 3.743353386993979, + "tokens_seen": 770021376 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038722166499498495, + "loss": 3.0496, + "theoretical_loss": 3.74333748698824, + "tokens_seen": 770054144 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003872116349047142, + "loss": 2.9108, + "theoretical_loss": 3.743305689574714, + "tokens_seen": 770119680 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003872016048144433, + "loss": 3.0026, + "theoretical_loss": 3.743273895624565, + "tokens_seen": 770185216 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038719157472417255, + "loss": 3.0047, + "theoretical_loss": 3.7432421051371225, + "tokens_seen": 770250752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038718154463390173, + "loss": 2.9938, + "theoretical_loss": 3.7432103181117133, + "tokens_seen": 770316288 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003871715145436309, + "loss": 2.9242, + "theoretical_loss": 3.743178534547666, + "tokens_seen": 770381824 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003871614844533601, + "loss": 2.9392, + "theoretical_loss": 3.7431467544443104, + "tokens_seen": 770447360 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003871514543630893, + "loss": 3.0262, + "theoretical_loss": 3.7431149778009742, + "tokens_seen": 770512896 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038714142427281846, + "loss": 2.9485, + "theoretical_loss": 3.7430832046169877, + "tokens_seen": 770578432 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003871313941825477, + "loss": 2.8429, + "theoretical_loss": 3.7430514348916786, + "tokens_seen": 770643968 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003871213640922768, + "loss": 2.7321, + "theoretical_loss": 3.743019668624377, + "tokens_seen": 770709504 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038711133400200605, + "loss": 3.0818, + "theoretical_loss": 3.742987905814413, + "tokens_seen": 770775040 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003871013039117352, + "loss": 2.9602, + "theoretical_loss": 3.7429561464611156, + "tokens_seen": 770840576 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003870912738214644, + "loss": 3.0945, + "theoretical_loss": 3.742924390563816, + "tokens_seen": 770906112 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003870812437311936, + "loss": 2.9896, + "theoretical_loss": 3.742892638121843, + "tokens_seen": 770971648 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003870712136409228, + "loss": 2.7858, + "theoretical_loss": 3.742860889134527, + "tokens_seen": 771037184 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038706118355065196, + "loss": 2.9759, + "theoretical_loss": 3.7428291436012, + "tokens_seen": 771102720 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003870511534603812, + "loss": 2.8853, + "theoretical_loss": 3.742797401521191, + "tokens_seen": 771168256 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003870411233701103, + "loss": 2.9405, + "theoretical_loss": 3.742765662893832, + "tokens_seen": 771233792 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038703109327983956, + "loss": 2.88, + "theoretical_loss": 3.742733927718454, + "tokens_seen": 771299328 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003870210631895687, + "loss": 2.8591, + "theoretical_loss": 3.742702195994388, + "tokens_seen": 771364864 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003870110330992979, + "loss": 2.9281, + "theoretical_loss": 3.7426704677209655, + "tokens_seen": 771430400 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003870010030090271, + "loss": 3.0309, + "theoretical_loss": 3.742638742897518, + "tokens_seen": 771495936 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003869909729187563, + "loss": 2.8782, + "theoretical_loss": 3.742607021523378, + "tokens_seen": 771561472 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038698094282848546, + "loss": 2.9055, + "theoretical_loss": 3.742575303597877, + "tokens_seen": 771627008 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1247133, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6997873783111572, + "objective/train/theoretical_loss": 3.7425594459281575, + "objective/train/tokens_used": 792119776, + "theoretical_loss": 3.7425594459281575, + "tokens_seen": 771659776 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038697091273821464, + "loss": 2.8764, + "theoretical_loss": 3.7425435891203476, + "tokens_seen": 771692544 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003869608826479438, + "loss": 2.9603, + "theoretical_loss": 3.7425118780901214, + "tokens_seen": 771758080 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038695085255767306, + "loss": 3.0837, + "theoretical_loss": 3.742480170506532, + "tokens_seen": 771823616 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003869408224674022, + "loss": 3.0623, + "theoretical_loss": 3.742448466368912, + "tokens_seen": 771889152 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003869307923771314, + "loss": 3.0836, + "theoretical_loss": 3.7424167656765936, + "tokens_seen": 771954688 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038692076228686055, + "loss": 2.9198, + "theoretical_loss": 3.74238506842891, + "tokens_seen": 772020224 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003869107321965898, + "loss": 3.1092, + "theoretical_loss": 3.742353374625196, + "tokens_seen": 772085760 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038690070210631896, + "loss": 3.0053, + "theoretical_loss": 3.742321684264784, + "tokens_seen": 772151296 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038689067201604815, + "loss": 3.0202, + "theoretical_loss": 3.742289997347007, + "tokens_seen": 772216832 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038688064192577733, + "loss": 3.1724, + "theoretical_loss": 3.7422583138712002, + "tokens_seen": 772282368 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038687061183550656, + "loss": 2.8469, + "theoretical_loss": 3.7422266338366974, + "tokens_seen": 772347904 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003868605817452357, + "loss": 3.1027, + "theoretical_loss": 3.7421949572428326, + "tokens_seen": 772413440 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003868505516549649, + "loss": 3.0701, + "theoretical_loss": 3.7421632840889405, + "tokens_seen": 772478976 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038684052156469405, + "loss": 2.9014, + "theoretical_loss": 3.742131614374355, + "tokens_seen": 772544512 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003868304914744233, + "loss": 2.8904, + "theoretical_loss": 3.742099948098412, + "tokens_seen": 772610048 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038682046138415247, + "loss": 2.884, + "theoretical_loss": 3.7420682852604457, + "tokens_seen": 772675584 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038681043129388165, + "loss": 3.0454, + "theoretical_loss": 3.7420366258597912, + "tokens_seen": 772741120 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038680040120361083, + "loss": 3.0141, + "theoretical_loss": 3.7420049698957847, + "tokens_seen": 772806656 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038679037111334, + "loss": 3.1175, + "theoretical_loss": 3.741973317367761, + "tokens_seen": 772872192 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003867803410230692, + "loss": 2.8015, + "theoretical_loss": 3.7419416682750573, + "tokens_seen": 772937728 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038677031093279843, + "loss": 2.8277, + "theoretical_loss": 3.741910022617007, + "tokens_seen": 773003264 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038676028084252755, + "loss": 2.9981, + "theoretical_loss": 3.741878380392948, + "tokens_seen": 773068800 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003867502507522568, + "loss": 2.993, + "theoretical_loss": 3.7418467416022168, + "tokens_seen": 773134336 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003867402206619859, + "loss": 2.9009, + "theoretical_loss": 3.7418151062441485, + "tokens_seen": 773199872 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038673019057171515, + "loss": 3.0323, + "theoretical_loss": 3.7417834743180807, + "tokens_seen": 773265408 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1250187, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.101785182952881, + "objective/train/theoretical_loss": 3.74176765964184, + "objective/train/tokens_used": 793758176, + "theoretical_loss": 3.74176765964184, + "tokens_seen": 773298176 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038672016048144433, + "loss": 2.9849, + "theoretical_loss": 3.7417518458233507, + "tokens_seen": 773330944 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003867101303911735, + "loss": 3.0942, + "theoretical_loss": 3.7417202207592943, + "tokens_seen": 773396480 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003867001003009027, + "loss": 2.9671, + "theoretical_loss": 3.7416885991252498, + "tokens_seen": 773462016 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038669007021063193, + "loss": 3.0604, + "theoretical_loss": 3.7416569809205535, + "tokens_seen": 773527552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038668004012036106, + "loss": 2.9578, + "theoretical_loss": 3.741625366144544, + "tokens_seen": 773593088 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003866700100300903, + "loss": 2.8602, + "theoretical_loss": 3.741593754796559, + "tokens_seen": 773658624 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003866599799398194, + "loss": 3.0705, + "theoretical_loss": 3.741562146875936, + "tokens_seen": 773724160 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038664994984954866, + "loss": 2.9068, + "theoretical_loss": 3.741530542382013, + "tokens_seen": 773789696 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038663991975927784, + "loss": 2.7628, + "theoretical_loss": 3.7414989413141293, + "tokens_seen": 773855232 + }, + { + "epoch": 2.03, + "learning_rate": 0.000386629889669007, + "loss": 2.8192, + "theoretical_loss": 3.7414673436716224, + "tokens_seen": 773920768 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038661985957873625, + "loss": 3.0063, + "theoretical_loss": 3.7414357494538315, + "tokens_seen": 773986304 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003866098294884654, + "loss": 3.0874, + "theoretical_loss": 3.741404158660095, + "tokens_seen": 774051840 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003865997993981946, + "loss": 2.9573, + "theoretical_loss": 3.7413725712897525, + "tokens_seen": 774117376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003865897693079238, + "loss": 2.9387, + "theoretical_loss": 3.7413409873421433, + "tokens_seen": 774182912 + }, + { + "epoch": 2.03, + "learning_rate": 0.000386579739217653, + "loss": 3.0302, + "theoretical_loss": 3.7413094068166064, + "tokens_seen": 774248448 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038656970912738216, + "loss": 2.9394, + "theoretical_loss": 3.7412778297124816, + "tokens_seen": 774313984 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003865596790371114, + "loss": 2.8831, + "theoretical_loss": 3.7412462560291084, + "tokens_seen": 774379520 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003865496489468405, + "loss": 3.0063, + "theoretical_loss": 3.7412146857658275, + "tokens_seen": 774445056 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038653961885656976, + "loss": 2.9536, + "theoretical_loss": 3.7411831189219784, + "tokens_seen": 774510592 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003865295887662989, + "loss": 2.8818, + "theoretical_loss": 3.7411515554969013, + "tokens_seen": 774576128 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003865195586760281, + "loss": 3.0532, + "theoretical_loss": 3.741119995489937, + "tokens_seen": 774641664 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003865095285857573, + "loss": 3.087, + "theoretical_loss": 3.741088438900427, + "tokens_seen": 774707200 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003864994984954865, + "loss": 3.0369, + "theoretical_loss": 3.741056885727711, + "tokens_seen": 774772736 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038648946840521566, + "loss": 2.976, + "theoretical_loss": 3.7410253359711305, + "tokens_seen": 774838272 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038647943831494484, + "loss": 2.9385, + "theoretical_loss": 3.7409937896300276, + "tokens_seen": 774903808 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1253205, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0192980766296387, + "objective/train/theoretical_loss": 3.7409780177400735, + "objective/train/tokens_used": 795396576, + "theoretical_loss": 3.7409780177400735, + "tokens_seen": 774936576 + }, + { + "epoch": 2.03, + "learning_rate": 0.000386469408224674, + "loss": 2.9998, + "theoretical_loss": 3.740962246703742, + "tokens_seen": 774969344 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038645937813440326, + "loss": 2.9984, + "theoretical_loss": 3.740930707191617, + "tokens_seen": 775034880 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003864493480441324, + "loss": 2.7061, + "theoretical_loss": 3.740899171092993, + "tokens_seen": 775100416 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003864393179538616, + "loss": 2.8419, + "theoretical_loss": 3.740867638407213, + "tokens_seen": 775165952 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038642928786359075, + "loss": 2.9594, + "theoretical_loss": 3.740836109133619, + "tokens_seen": 775231488 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038641925777332, + "loss": 3.0518, + "theoretical_loss": 3.7408045832715526, + "tokens_seen": 775297024 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038640922768304917, + "loss": 2.8553, + "theoretical_loss": 3.7407730608203575, + "tokens_seen": 775362560 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038639919759277835, + "loss": 2.9, + "theoretical_loss": 3.740741541779376, + "tokens_seen": 775428096 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038638916750250753, + "loss": 2.9957, + "theoretical_loss": 3.7407100261479505, + "tokens_seen": 775493632 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038637913741223676, + "loss": 2.9349, + "theoretical_loss": 3.7406785139254244, + "tokens_seen": 775559168 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003863691073219659, + "loss": 2.858, + "theoretical_loss": 3.740647005111141, + "tokens_seen": 775624704 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003863590772316951, + "loss": 3.0049, + "theoretical_loss": 3.740615499704444, + "tokens_seen": 775690240 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038634904714142425, + "loss": 3.0088, + "theoretical_loss": 3.740583997704676, + "tokens_seen": 775755776 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003863390170511535, + "loss": 3.087, + "theoretical_loss": 3.7405524991111827, + "tokens_seen": 775821312 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038632898696088267, + "loss": 2.9187, + "theoretical_loss": 3.7405210039233063, + "tokens_seen": 775886848 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038631895687061185, + "loss": 2.9454, + "theoretical_loss": 3.7404895121403916, + "tokens_seen": 775952384 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038630892678034103, + "loss": 3.0247, + "theoretical_loss": 3.7404580237617835, + "tokens_seen": 776017920 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003862988966900702, + "loss": 2.9093, + "theoretical_loss": 3.7404265387868256, + "tokens_seen": 776083456 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003862888665997994, + "loss": 2.8628, + "theoretical_loss": 3.7403950572148634, + "tokens_seen": 776148992 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038627883650952863, + "loss": 2.8645, + "theoretical_loss": 3.740363579045241, + "tokens_seen": 776214528 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038626880641925775, + "loss": 3.0117, + "theoretical_loss": 3.7403321042773046, + "tokens_seen": 776280064 + }, + { + "epoch": 2.03, + "learning_rate": 0.000386258776328987, + "loss": 2.9795, + "theoretical_loss": 3.7403006329103983, + "tokens_seen": 776345600 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003862487462387161, + "loss": 3.0301, + "theoretical_loss": 3.740269164943868, + "tokens_seen": 776411136 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038623871614844535, + "loss": 3.1505, + "theoretical_loss": 3.7402377003770595, + "tokens_seen": 776476672 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038622868605817453, + "loss": 3.0255, + "theoretical_loss": 3.7402062392093187, + "tokens_seen": 776542208 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1255142, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1912689208984375, + "objective/train/theoretical_loss": 3.740190509899894, + "objective/train/tokens_used": 797034976, + "theoretical_loss": 3.740190509899894, + "tokens_seen": 776574976 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003862186559679037, + "loss": 2.9593, + "theoretical_loss": 3.740174781439991, + "tokens_seen": 776607744 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003862086258776329, + "loss": 3.009, + "theoretical_loss": 3.7401433270684232, + "tokens_seen": 776673280 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038619859578736213, + "loss": 3.0447, + "theoretical_loss": 3.740111876093961, + "tokens_seen": 776738816 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038618856569709126, + "loss": 3.0228, + "theoretical_loss": 3.7400804285159515, + "tokens_seen": 776804352 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003861785356068205, + "loss": 3.0663, + "theoretical_loss": 3.7400489843337414, + "tokens_seen": 776869888 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003861685055165496, + "loss": 2.9648, + "theoretical_loss": 3.740017543546678, + "tokens_seen": 776935424 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038615847542627886, + "loss": 2.8788, + "theoretical_loss": 3.739986106154107, + "tokens_seen": 777000960 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038614844533600804, + "loss": 3.0057, + "theoretical_loss": 3.7399546721553762, + "tokens_seen": 777066496 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003861384152457372, + "loss": 2.9087, + "theoretical_loss": 3.7399232415498336, + "tokens_seen": 777132032 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003861283851554664, + "loss": 2.9005, + "theoretical_loss": 3.7398918143368265, + "tokens_seen": 777197568 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003861183550651956, + "loss": 2.9079, + "theoretical_loss": 3.7398603905157026, + "tokens_seen": 777263104 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038610832497492476, + "loss": 2.9171, + "theoretical_loss": 3.7398289700858105, + "tokens_seen": 777328640 + }, + { + "epoch": 2.03, + "learning_rate": 0.000386098294884654, + "loss": 2.8614, + "theoretical_loss": 3.7397975530464973, + "tokens_seen": 777394176 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003860882647943831, + "loss": 3.1159, + "theoretical_loss": 3.739766139397112, + "tokens_seen": 777459712 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038607823470411236, + "loss": 2.7772, + "theoretical_loss": 3.7397347291370027, + "tokens_seen": 777525248 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003860682046138415, + "loss": 2.861, + "theoretical_loss": 3.7397033222655187, + "tokens_seen": 777590784 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003860581745235707, + "loss": 3.0444, + "theoretical_loss": 3.739671918782008, + "tokens_seen": 777656320 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003860481444332999, + "loss": 3.0496, + "theoretical_loss": 3.739640518685821, + "tokens_seen": 777721856 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003860381143430291, + "loss": 3.007, + "theoretical_loss": 3.739609121976306, + "tokens_seen": 777787392 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038602808425275826, + "loss": 2.8956, + "theoretical_loss": 3.7395777286528125, + "tokens_seen": 777852928 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003860180541624875, + "loss": 2.8911, + "theoretical_loss": 3.73954633871469, + "tokens_seen": 777918464 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003860080240722166, + "loss": 2.961, + "theoretical_loss": 3.7395149521612883, + "tokens_seen": 777984000 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038599799398194586, + "loss": 3.0121, + "theoretical_loss": 3.7394835689919574, + "tokens_seen": 778049536 + }, + { + "epoch": 2.03, + "learning_rate": 0.000385987963891675, + "loss": 3.0805, + "theoretical_loss": 3.7394521892060477, + "tokens_seen": 778115072 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003859779338014042, + "loss": 2.9778, + "theoretical_loss": 3.7394208128029094, + "tokens_seen": 778180608 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1257930, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7905778884887695, + "objective/train/theoretical_loss": 3.739405125869676, + "objective/train/tokens_used": 798673376, + "theoretical_loss": 3.739405125869676, + "tokens_seen": 778213376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003859679037111334, + "loss": 3.0339, + "theoretical_loss": 3.7393894397818928, + "tokens_seen": 778246144 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003859578736208626, + "loss": 2.9583, + "theoretical_loss": 3.7393580701423486, + "tokens_seen": 778311680 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038594784353059177, + "loss": 2.9277, + "theoretical_loss": 3.739326703883628, + "tokens_seen": 778377216 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038593781344032095, + "loss": 3.0814, + "theoretical_loss": 3.7392953410050813, + "tokens_seen": 778442752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038592778335005013, + "loss": 2.9775, + "theoretical_loss": 3.7392639815060607, + "tokens_seen": 778508288 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038591775325977937, + "loss": 3.0065, + "theoretical_loss": 3.7392326253859167, + "tokens_seen": 778573824 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003859077231695085, + "loss": 2.9574, + "theoretical_loss": 3.739201272644001, + "tokens_seen": 778639360 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038589769307923773, + "loss": 3.1703, + "theoretical_loss": 3.7391699232796665, + "tokens_seen": 778704896 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038588766298896685, + "loss": 3.075, + "theoretical_loss": 3.7391385772922634, + "tokens_seen": 778770432 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003858776328986961, + "loss": 3.0903, + "theoretical_loss": 3.739107234681145, + "tokens_seen": 778835968 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003858676028084253, + "loss": 3.1003, + "theoretical_loss": 3.739075895445663, + "tokens_seen": 778901504 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038585757271815445, + "loss": 3.0428, + "theoretical_loss": 3.73904455958517, + "tokens_seen": 778967040 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003858475426278837, + "loss": 2.966, + "theoretical_loss": 3.739013227099019, + "tokens_seen": 779032576 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038583751253761287, + "loss": 3.0155, + "theoretical_loss": 3.7389818979865623, + "tokens_seen": 779098112 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038582748244734205, + "loss": 3.1165, + "theoretical_loss": 3.738950572247153, + "tokens_seen": 779163648 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038581745235707123, + "loss": 2.8996, + "theoretical_loss": 3.738919249880145, + "tokens_seen": 779229184 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003858074222668004, + "loss": 3.0221, + "theoretical_loss": 3.7388879308848906, + "tokens_seen": 779294720 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003857973921765296, + "loss": 2.9922, + "theoretical_loss": 3.738856615260744, + "tokens_seen": 779360256 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038578736208625883, + "loss": 3.0178, + "theoretical_loss": 3.738825303007059, + "tokens_seen": 779425792 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038577733199598796, + "loss": 2.8648, + "theoretical_loss": 3.7387939941231885, + "tokens_seen": 779491328 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003857673019057172, + "loss": 2.9049, + "theoretical_loss": 3.7387626886084875, + "tokens_seen": 779556864 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003857572718154463, + "loss": 2.9428, + "theoretical_loss": 3.73873138646231, + "tokens_seen": 779622400 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038574724172517555, + "loss": 2.8621, + "theoretical_loss": 3.738700087684011, + "tokens_seen": 779687936 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038573721163490473, + "loss": 2.9442, + "theoretical_loss": 3.738668792272944, + "tokens_seen": 779753472 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003857271815446339, + "loss": 3.1471, + "theoretical_loss": 3.7386375002284638, + "tokens_seen": 779819008 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1260528, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.577179193496704, + "objective/train/theoretical_loss": 3.7386218554684927, + "objective/train/tokens_used": 800311776, + "theoretical_loss": 3.7386218554684927, + "tokens_seen": 779851776 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003857171514543631, + "loss": 2.7138, + "theoretical_loss": 3.738606211549927, + "tokens_seen": 779884544 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038570712136409233, + "loss": 2.9294, + "theoretical_loss": 3.738574926236687, + "tokens_seen": 779950080 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038569709127382146, + "loss": 3.0492, + "theoretical_loss": 3.7385436442880993, + "tokens_seen": 780015616 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003856870611835507, + "loss": 2.8795, + "theoretical_loss": 3.738512365703521, + "tokens_seen": 780081152 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003856770310932798, + "loss": 2.8153, + "theoretical_loss": 3.738481090482306, + "tokens_seen": 780146688 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038566700100300906, + "loss": 2.9438, + "theoretical_loss": 3.7384498186238106, + "tokens_seen": 780212224 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038565697091273824, + "loss": 3.0219, + "theoretical_loss": 3.738418550127391, + "tokens_seen": 780277760 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003856469408224674, + "loss": 2.8389, + "theoretical_loss": 3.7383872849924034, + "tokens_seen": 780343296 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003856369107321966, + "loss": 2.9388, + "theoretical_loss": 3.738356023218204, + "tokens_seen": 780408832 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003856268806419258, + "loss": 2.9844, + "theoretical_loss": 3.7383247648041493, + "tokens_seen": 780474368 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038561685055165496, + "loss": 2.8863, + "theoretical_loss": 3.738293509749597, + "tokens_seen": 780539904 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003856068204613842, + "loss": 3.0101, + "theoretical_loss": 3.7382622580539024, + "tokens_seen": 780605440 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003855967903711133, + "loss": 3.0634, + "theoretical_loss": 3.738231009716424, + "tokens_seen": 780670976 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038558676028084256, + "loss": 3.1272, + "theoretical_loss": 3.7381997647365184, + "tokens_seen": 780736512 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003855767301905717, + "loss": 2.9215, + "theoretical_loss": 3.7381685231135435, + "tokens_seen": 780802048 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003855667001003009, + "loss": 3.0153, + "theoretical_loss": 3.738137284846856, + "tokens_seen": 780867584 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003855566700100301, + "loss": 2.9714, + "theoretical_loss": 3.7381060499358143, + "tokens_seen": 780933120 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003855466399197593, + "loss": 2.9247, + "theoretical_loss": 3.7380748183797765, + "tokens_seen": 780998656 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038553660982948846, + "loss": 2.9996, + "theoretical_loss": 3.7380435901781004, + "tokens_seen": 781064192 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003855265797392177, + "loss": 3.0503, + "theoretical_loss": 3.738012365330145, + "tokens_seen": 781129728 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003855165496489468, + "loss": 2.8926, + "theoretical_loss": 3.7379811438352677, + "tokens_seen": 781195264 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038550651955867606, + "loss": 2.9188, + "theoretical_loss": 3.737949925692828, + "tokens_seen": 781260800 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003854964894684052, + "loss": 2.9695, + "theoretical_loss": 3.7379187109021847, + "tokens_seen": 781326336 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003854864593781344, + "loss": 2.9056, + "theoretical_loss": 3.7378874994626967, + "tokens_seen": 781391872 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003854764292878636, + "loss": 2.8639, + "theoretical_loss": 3.737856291373723, + "tokens_seen": 781457408 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1263282, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.057778835296631, + "objective/train/theoretical_loss": 3.7378406885854787, + "objective/train/tokens_used": 801950176, + "theoretical_loss": 3.7378406885854787, + "tokens_seen": 781490176 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003854663991975928, + "loss": 2.8975, + "theoretical_loss": 3.737825086634623, + "tokens_seen": 781522944 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038545636910732197, + "loss": 2.8535, + "theoretical_loss": 3.7377938852447565, + "tokens_seen": 781588480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038544633901705115, + "loss": 2.9368, + "theoretical_loss": 3.7377626872034835, + "tokens_seen": 781654016 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038543630892678033, + "loss": 2.751, + "theoretical_loss": 3.7377314925101635, + "tokens_seen": 781719552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038542627883650957, + "loss": 3.0417, + "theoretical_loss": 3.7377003011641565, + "tokens_seen": 781785088 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003854162487462387, + "loss": 2.9606, + "theoretical_loss": 3.7376691131648228, + "tokens_seen": 781850624 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038540621865596793, + "loss": 2.9484, + "theoretical_loss": 3.737637928511523, + "tokens_seen": 781916160 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038539618856569705, + "loss": 2.9606, + "theoretical_loss": 3.7376067472036185, + "tokens_seen": 781981696 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003853861584754263, + "loss": 3.1285, + "theoretical_loss": 3.737575569240468, + "tokens_seen": 782047232 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038537612838515547, + "loss": 2.9973, + "theoretical_loss": 3.737544394621435, + "tokens_seen": 782112768 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038536609829488465, + "loss": 2.8044, + "theoretical_loss": 3.737513223345878, + "tokens_seen": 782178304 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038535606820461383, + "loss": 3.0342, + "theoretical_loss": 3.7374820554131607, + "tokens_seen": 782243840 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038534603811434307, + "loss": 2.9792, + "theoretical_loss": 3.737450890822643, + "tokens_seen": 782309376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003853360080240722, + "loss": 3.082, + "theoretical_loss": 3.7374197295736877, + "tokens_seen": 782374912 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038532597793380143, + "loss": 2.8704, + "theoretical_loss": 3.737388571665656, + "tokens_seen": 782440448 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038531594784353056, + "loss": 2.955, + "theoretical_loss": 3.73735741709791, + "tokens_seen": 782505984 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003853059177532598, + "loss": 2.955, + "theoretical_loss": 3.737326265869812, + "tokens_seen": 782571520 + }, + { + "epoch": 2.03, + "learning_rate": 0.000385295887662989, + "loss": 3.1636, + "theoretical_loss": 3.7372951179807234, + "tokens_seen": 782637056 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038528585757271816, + "loss": 2.8733, + "theoretical_loss": 3.737263973430009, + "tokens_seen": 782702592 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038527582748244734, + "loss": 3.039, + "theoretical_loss": 3.737232832217029, + "tokens_seen": 782768128 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003852657973921765, + "loss": 3.0045, + "theoretical_loss": 3.7372016943411484, + "tokens_seen": 782833664 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003852557673019057, + "loss": 2.9863, + "theoretical_loss": 3.737170559801729, + "tokens_seen": 782899200 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038524573721163493, + "loss": 3.0106, + "theoretical_loss": 3.7371394285981343, + "tokens_seen": 782964736 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038523570712136406, + "loss": 2.7939, + "theoretical_loss": 3.737108300729728, + "tokens_seen": 783030272 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003852256770310933, + "loss": 2.9122, + "theoretical_loss": 3.737077176195873, + "tokens_seen": 783095808 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1266049, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.172506093978882, + "objective/train/theoretical_loss": 3.737061615179204, + "objective/train/tokens_used": 803588576, + "theoretical_loss": 3.737061615179204, + "tokens_seen": 783128576 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003852156469408225, + "loss": 3.0693, + "theoretical_loss": 3.7370460549959343, + "tokens_seen": 783161344 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038520561685055166, + "loss": 3.1001, + "theoretical_loss": 3.737014937129275, + "tokens_seen": 783226880 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038519558676028084, + "loss": 3.1113, + "theoretical_loss": 3.736983822595259, + "tokens_seen": 783292416 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038518555667001, + "loss": 2.8152, + "theoretical_loss": 3.7369527113932506, + "tokens_seen": 783357952 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003851755265797392, + "loss": 2.8756, + "theoretical_loss": 3.736921603522615, + "tokens_seen": 783423488 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038516549648946844, + "loss": 2.8808, + "theoretical_loss": 3.7368904989827163, + "tokens_seen": 783489024 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038515546639919756, + "loss": 2.8681, + "theoretical_loss": 3.7368593977729194, + "tokens_seen": 783554560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003851454363089268, + "loss": 3.0261, + "theoretical_loss": 3.73682829989259, + "tokens_seen": 783620096 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003851354062186559, + "loss": 3.0916, + "theoretical_loss": 3.736797205341092, + "tokens_seen": 783685632 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038512537612838516, + "loss": 2.9473, + "theoretical_loss": 3.7367661141177915, + "tokens_seen": 783751168 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003851153460381144, + "loss": 3.0383, + "theoretical_loss": 3.736735026222054, + "tokens_seen": 783816704 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003851053159478435, + "loss": 2.8617, + "theoretical_loss": 3.7367039416532446, + "tokens_seen": 783882240 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038509528585757276, + "loss": 2.9363, + "theoretical_loss": 3.7366728604107298, + "tokens_seen": 783947776 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003850852557673019, + "loss": 3.0688, + "theoretical_loss": 3.7366417824938756, + "tokens_seen": 784013312 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003850752256770311, + "loss": 2.8588, + "theoretical_loss": 3.736610707902048, + "tokens_seen": 784078848 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003850651955867603, + "loss": 2.9811, + "theoretical_loss": 3.7365796366346133, + "tokens_seen": 784144384 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003850551654964895, + "loss": 3.0697, + "theoretical_loss": 3.7365485686909388, + "tokens_seen": 784209920 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038504513540621866, + "loss": 3.0411, + "theoretical_loss": 3.73651750407039, + "tokens_seen": 784275456 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003850351053159479, + "loss": 3.0483, + "theoretical_loss": 3.7364864427723345, + "tokens_seen": 784340992 + }, + { + "epoch": 2.03, + "learning_rate": 0.000385025075225677, + "loss": 2.9781, + "theoretical_loss": 3.7364553847961393, + "tokens_seen": 784406528 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038501504513540626, + "loss": 3.0811, + "theoretical_loss": 3.7364243301411717, + "tokens_seen": 784472064 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003850050150451354, + "loss": 2.755, + "theoretical_loss": 3.7363932788067995, + "tokens_seen": 784537600 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003849949849548646, + "loss": 2.9214, + "theoretical_loss": 3.7363622307923894, + "tokens_seen": 784603136 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003849849548645938, + "loss": 2.9017, + "theoretical_loss": 3.7363311860973094, + "tokens_seen": 784668672 + }, + { + "epoch": 2.03, + "learning_rate": 0.000384974924774323, + "loss": 2.9282, + "theoretical_loss": 3.7363001447209285, + "tokens_seen": 784734208 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1267573, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1838457584381104, + "objective/train/theoretical_loss": 3.736284625277052, + "objective/train/tokens_used": 805226976, + "theoretical_loss": 3.736284625277052, + "tokens_seen": 784766976 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038496489468405217, + "loss": 3.0732, + "theoretical_loss": 3.736269106662613, + "tokens_seen": 784799744 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038495486459378135, + "loss": 2.9024, + "theoretical_loss": 3.7362380719217327, + "tokens_seen": 784865280 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038494483450351053, + "loss": 2.9344, + "theoretical_loss": 3.736207040497656, + "tokens_seen": 784930816 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038493480441323977, + "loss": 3.0111, + "theoretical_loss": 3.73617601238975, + "tokens_seen": 784996352 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003849247743229689, + "loss": 3.0967, + "theoretical_loss": 3.736144987597386, + "tokens_seen": 785061888 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038491474423269813, + "loss": 3.055, + "theoretical_loss": 3.7361139661199307, + "tokens_seen": 785127424 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038490471414242725, + "loss": 2.6934, + "theoretical_loss": 3.7360829479567546, + "tokens_seen": 785192960 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003848946840521565, + "loss": 3.0124, + "theoretical_loss": 3.736051933107226, + "tokens_seen": 785258496 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038488465396188567, + "loss": 2.9648, + "theoretical_loss": 3.7360209215707156, + "tokens_seen": 785324032 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038487462387161485, + "loss": 2.8998, + "theoretical_loss": 3.7359899133465917, + "tokens_seen": 785389568 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038486459378134403, + "loss": 3.0414, + "theoretical_loss": 3.7359589084342253, + "tokens_seen": 785455104 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038485456369107327, + "loss": 2.9686, + "theoretical_loss": 3.7359279068329863, + "tokens_seen": 785520640 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003848445336008024, + "loss": 2.9139, + "theoretical_loss": 3.7358969085422444, + "tokens_seen": 785586176 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038483450351053163, + "loss": 3.0045, + "theoretical_loss": 3.73586591356137, + "tokens_seen": 785651712 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038482447342026076, + "loss": 3.0563, + "theoretical_loss": 3.7358349218897335, + "tokens_seen": 785717248 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038481444332999, + "loss": 3.0466, + "theoretical_loss": 3.7358039335267064, + "tokens_seen": 785782784 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003848044132397192, + "loss": 2.9234, + "theoretical_loss": 3.7357729484716584, + "tokens_seen": 785848320 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038479438314944836, + "loss": 2.9898, + "theoretical_loss": 3.735741966723962, + "tokens_seen": 785913856 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038478435305917754, + "loss": 2.9897, + "theoretical_loss": 3.7357109882829875, + "tokens_seen": 785979392 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003847743229689067, + "loss": 3.0644, + "theoretical_loss": 3.7356800131481065, + "tokens_seen": 786044928 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003847642928786359, + "loss": 3.0393, + "theoretical_loss": 3.7356490413186902, + "tokens_seen": 786110464 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038475426278836513, + "loss": 2.9629, + "theoretical_loss": 3.735618072794111, + "tokens_seen": 786176000 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038474423269809426, + "loss": 2.956, + "theoretical_loss": 3.73558710757374, + "tokens_seen": 786241536 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003847342026078235, + "loss": 2.902, + "theoretical_loss": 3.73555614565695, + "tokens_seen": 786307072 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003847241725175527, + "loss": 2.9753, + "theoretical_loss": 3.735525187043113, + "tokens_seen": 786372608 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1270619, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9801981449127197, + "objective/train/theoretical_loss": 3.7355097089746057, + "objective/train/tokens_used": 806865376, + "theoretical_loss": 3.7355097089746057, + "tokens_seen": 786405376 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038471414242728186, + "loss": 2.8512, + "theoretical_loss": 3.735494231731601, + "tokens_seen": 786438144 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038470411233701104, + "loss": 2.7896, + "theoretical_loss": 3.7354632797217877, + "tokens_seen": 786503680 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003846940822467402, + "loss": 2.9637, + "theoretical_loss": 3.735432331013045, + "tokens_seen": 786569216 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003846840521564694, + "loss": 3.0008, + "theoretical_loss": 3.735401385604746, + "tokens_seen": 786634752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038467402206619864, + "loss": 2.9516, + "theoretical_loss": 3.7353704434962633, + "tokens_seen": 786700288 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038466399197592776, + "loss": 2.9568, + "theoretical_loss": 3.735339504686971, + "tokens_seen": 786765824 + }, + { + "epoch": 2.03, + "learning_rate": 0.000384653961885657, + "loss": 3.0286, + "theoretical_loss": 3.735308569176243, + "tokens_seen": 786831360 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003846439317953861, + "loss": 2.9056, + "theoretical_loss": 3.735277636963451, + "tokens_seen": 786896896 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038463390170511536, + "loss": 2.869, + "theoretical_loss": 3.7352467080479705, + "tokens_seen": 786962432 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038462387161484454, + "loss": 2.9396, + "theoretical_loss": 3.735215782429175, + "tokens_seen": 787027968 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003846138415245737, + "loss": 2.8972, + "theoretical_loss": 3.735184860106439, + "tokens_seen": 787093504 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003846038114343029, + "loss": 3.0609, + "theoretical_loss": 3.735153941079136, + "tokens_seen": 787159040 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003845937813440321, + "loss": 2.9826, + "theoretical_loss": 3.735123025346641, + "tokens_seen": 787224576 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038458375125376127, + "loss": 2.9031, + "theoretical_loss": 3.7350921129083283, + "tokens_seen": 787290112 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003845737211634905, + "loss": 3.1421, + "theoretical_loss": 3.735061203763573, + "tokens_seen": 787355648 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038456369107321963, + "loss": 3.0151, + "theoretical_loss": 3.73503029791175, + "tokens_seen": 787421184 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038455366098294886, + "loss": 2.9981, + "theoretical_loss": 3.7349993953522347, + "tokens_seen": 787486720 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038454363089267805, + "loss": 2.8786, + "theoretical_loss": 3.7349684960844023, + "tokens_seen": 787552256 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038453360080240723, + "loss": 2.9752, + "theoretical_loss": 3.734937600107628, + "tokens_seen": 787617792 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003845235707121364, + "loss": 2.9385, + "theoretical_loss": 3.734906707421288, + "tokens_seen": 787683328 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003845135406218656, + "loss": 2.9188, + "theoretical_loss": 3.7348758180247574, + "tokens_seen": 787748864 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038450351053159477, + "loss": 3.0068, + "theoretical_loss": 3.734844931917413, + "tokens_seen": 787814400 + }, + { + "epoch": 2.03, + "learning_rate": 0.000384493480441324, + "loss": 2.8789, + "theoretical_loss": 3.73481404909863, + "tokens_seen": 787879936 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038448345035105313, + "loss": 3.0363, + "theoretical_loss": 3.7347831695677867, + "tokens_seen": 787945472 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038447342026078237, + "loss": 2.9933, + "theoretical_loss": 3.7347522933242576, + "tokens_seen": 788011008 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1273240, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7447268962860107, + "objective/train/theoretical_loss": 3.734736856435041, + "objective/train/tokens_used": 808503776, + "theoretical_loss": 3.734736856435041, + "tokens_seen": 788043776 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003844633901705115, + "loss": 2.9351, + "theoretical_loss": 3.73472142036742, + "tokens_seen": 788076544 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038445336008024073, + "loss": 2.974, + "theoretical_loss": 3.734690550696651, + "tokens_seen": 788142080 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003844433299899699, + "loss": 3.054, + "theoretical_loss": 3.7346596843113273, + "tokens_seen": 788207616 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003844332998996991, + "loss": 2.9582, + "theoretical_loss": 3.7346288212108267, + "tokens_seen": 788273152 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003844232698094283, + "loss": 2.9603, + "theoretical_loss": 3.734597961394526, + "tokens_seen": 788338688 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038441323971915745, + "loss": 2.9299, + "theoretical_loss": 3.734567104861803, + "tokens_seen": 788404224 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038440320962888664, + "loss": 2.9442, + "theoretical_loss": 3.7345362516120355, + "tokens_seen": 788469760 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038439317953861587, + "loss": 2.9787, + "theoretical_loss": 3.7345054016446015, + "tokens_seen": 788535296 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038438314944834505, + "loss": 2.924, + "theoretical_loss": 3.7344745549588785, + "tokens_seen": 788600832 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038437311935807423, + "loss": 2.9633, + "theoretical_loss": 3.7344437115542446, + "tokens_seen": 788666368 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038436308926780347, + "loss": 3.008, + "theoretical_loss": 3.734412871430079, + "tokens_seen": 788731904 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003843530591775326, + "loss": 2.8334, + "theoretical_loss": 3.73438203458576, + "tokens_seen": 788797440 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038434302908726183, + "loss": 2.8741, + "theoretical_loss": 3.734351201020666, + "tokens_seen": 788862976 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038433299899699096, + "loss": 2.7656, + "theoretical_loss": 3.734320370734176, + "tokens_seen": 788928512 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003843229689067202, + "loss": 2.8951, + "theoretical_loss": 3.7342895437256702, + "tokens_seen": 788994048 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003843129388164494, + "loss": 2.949, + "theoretical_loss": 3.734258719994526, + "tokens_seen": 789059584 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038430290872617856, + "loss": 2.9008, + "theoretical_loss": 3.7342278995401235, + "tokens_seen": 789125120 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038429287863590774, + "loss": 3.1003, + "theoretical_loss": 3.7341970823618427, + "tokens_seen": 789190656 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003842828485456369, + "loss": 2.9478, + "theoretical_loss": 3.7341662684590626, + "tokens_seen": 789256192 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003842728184553661, + "loss": 2.9741, + "theoretical_loss": 3.734135457831164, + "tokens_seen": 789321728 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038426278836509533, + "loss": 2.8729, + "theoretical_loss": 3.7341046504775264, + "tokens_seen": 789387264 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038425275827482446, + "loss": 3.0097, + "theoretical_loss": 3.7340738463975303, + "tokens_seen": 789452800 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003842427281845537, + "loss": 2.9499, + "theoretical_loss": 3.734043045590556, + "tokens_seen": 789518336 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003842326980942829, + "loss": 2.7203, + "theoretical_loss": 3.7340122480559836, + "tokens_seen": 789583872 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038422266800401206, + "loss": 2.8271, + "theoretical_loss": 3.733981453793195, + "tokens_seen": 789649408 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1276035, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9641449451446533, + "objective/train/theoretical_loss": 3.7339660578885256, + "objective/train/tokens_used": 810142176, + "theoretical_loss": 3.7339660578885256, + "tokens_seen": 789682176 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038421263791374124, + "loss": 3.0332, + "theoretical_loss": 3.73395066280157, + "tokens_seen": 789714944 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003842026078234704, + "loss": 2.9025, + "theoretical_loss": 3.733919875080491, + "tokens_seen": 789780480 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003841925777331996, + "loss": 3.0017, + "theoretical_loss": 3.733889090629338, + "tokens_seen": 789846016 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038418254764292884, + "loss": 2.953, + "theoretical_loss": 3.733858309447492, + "tokens_seen": 789911552 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038417251755265796, + "loss": 3.013, + "theoretical_loss": 3.7338275315343368, + "tokens_seen": 789977088 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003841624874623872, + "loss": 2.967, + "theoretical_loss": 3.7337967568892525, + "tokens_seen": 790042624 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003841524573721163, + "loss": 3.0505, + "theoretical_loss": 3.733765985511621, + "tokens_seen": 790108160 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038414242728184556, + "loss": 2.8474, + "theoretical_loss": 3.7337352174008256, + "tokens_seen": 790173696 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038413239719157474, + "loss": 2.8954, + "theoretical_loss": 3.733704452556247, + "tokens_seen": 790239232 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003841223671013039, + "loss": 2.9012, + "theoretical_loss": 3.7336736909772688, + "tokens_seen": 790304768 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003841123370110331, + "loss": 2.9116, + "theoretical_loss": 3.7336429326632734, + "tokens_seen": 790370304 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003841023069207623, + "loss": 3.0436, + "theoretical_loss": 3.733612177613643, + "tokens_seen": 790435840 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038409227683049147, + "loss": 2.9616, + "theoretical_loss": 3.7335814258277606, + "tokens_seen": 790501376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003840822467402207, + "loss": 2.7551, + "theoretical_loss": 3.73355067730501, + "tokens_seen": 790566912 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038407221664994983, + "loss": 2.8699, + "theoretical_loss": 3.7335199320447745, + "tokens_seen": 790632448 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038406218655967907, + "loss": 3.0839, + "theoretical_loss": 3.733489190046437, + "tokens_seen": 790697984 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038405215646940825, + "loss": 2.9435, + "theoretical_loss": 3.733458451309381, + "tokens_seen": 790763520 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038404212637913743, + "loss": 2.9045, + "theoretical_loss": 3.733427715832991, + "tokens_seen": 790829056 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003840320962888666, + "loss": 2.9099, + "theoretical_loss": 3.7333969836166503, + "tokens_seen": 790894592 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003840220661985958, + "loss": 2.9758, + "theoretical_loss": 3.7333662546597433, + "tokens_seen": 790960128 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038401203610832497, + "loss": 2.9408, + "theoretical_loss": 3.7333355289616543, + "tokens_seen": 791025664 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003840020060180542, + "loss": 2.9776, + "theoretical_loss": 3.733304806521767, + "tokens_seen": 791091200 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038399197592778333, + "loss": 2.7781, + "theoretical_loss": 3.733274087339468, + "tokens_seen": 791156736 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038398194583751257, + "loss": 2.9124, + "theoretical_loss": 3.73324337141414, + "tokens_seen": 791222272 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003839719157472417, + "loss": 3.0256, + "theoretical_loss": 3.7332126587451686, + "tokens_seen": 791287808 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1278846, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9166131019592285, + "objective/train/theoretical_loss": 3.7331973036316253, + "objective/train/tokens_used": 811780576, + "theoretical_loss": 3.7331973036316253, + "tokens_seen": 791320576 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038396188565697093, + "loss": 2.675, + "theoretical_loss": 3.7331819493319394, + "tokens_seen": 791353344 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003839518555667001, + "loss": 2.8083, + "theoretical_loss": 3.733151243173838, + "tokens_seen": 791418880 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003839418254764293, + "loss": 2.8876, + "theoretical_loss": 3.733120540270248, + "tokens_seen": 791484416 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003839317953861585, + "loss": 2.8991, + "theoretical_loss": 3.733089840620557, + "tokens_seen": 791549952 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038392176529588765, + "loss": 2.9703, + "theoretical_loss": 3.73305914422415, + "tokens_seen": 791615488 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038391173520561684, + "loss": 3.0335, + "theoretical_loss": 3.7330284510804126, + "tokens_seen": 791681024 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038390170511534607, + "loss": 2.986, + "theoretical_loss": 3.7329977611887317, + "tokens_seen": 791746560 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003838916750250752, + "loss": 2.9268, + "theoretical_loss": 3.7329670745484926, + "tokens_seen": 791812096 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038388164493480443, + "loss": 3.0169, + "theoretical_loss": 3.732936391159083, + "tokens_seen": 791877632 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003838716148445336, + "loss": 2.9888, + "theoretical_loss": 3.732905711019889, + "tokens_seen": 791943168 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003838615847542628, + "loss": 2.8898, + "theoretical_loss": 3.7328750341302968, + "tokens_seen": 792008704 + }, + { + "epoch": 2.03, + "learning_rate": 0.000383851554663992, + "loss": 2.9612, + "theoretical_loss": 3.732844360489694, + "tokens_seen": 792074240 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038384152457372116, + "loss": 2.9133, + "theoretical_loss": 3.7328136900974673, + "tokens_seen": 792139776 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038383149448345034, + "loss": 2.9033, + "theoretical_loss": 3.7327830229530043, + "tokens_seen": 792205312 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003838214643931796, + "loss": 2.9538, + "theoretical_loss": 3.7327523590556924, + "tokens_seen": 792270848 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003838114343029087, + "loss": 2.8165, + "theoretical_loss": 3.7327216984049194, + "tokens_seen": 792336384 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038380140421263794, + "loss": 3.0301, + "theoretical_loss": 3.7326910410000727, + "tokens_seen": 792401920 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038379137412236706, + "loss": 2.9535, + "theoretical_loss": 3.732660386840541, + "tokens_seen": 792467456 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003837813440320963, + "loss": 2.89, + "theoretical_loss": 3.7326297359257117, + "tokens_seen": 792532992 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003837713139418255, + "loss": 2.8856, + "theoretical_loss": 3.7325990882549727, + "tokens_seen": 792598528 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038376128385155466, + "loss": 2.9938, + "theoretical_loss": 3.7325684438277134, + "tokens_seen": 792664064 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038375125376128384, + "loss": 3.0044, + "theoretical_loss": 3.7325378026433222, + "tokens_seen": 792729600 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003837412236710131, + "loss": 2.8378, + "theoretical_loss": 3.732507164701187, + "tokens_seen": 792795136 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003837311935807422, + "loss": 2.9753, + "theoretical_loss": 3.7324765300006977, + "tokens_seen": 792860672 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038372116349047144, + "loss": 3.0033, + "theoretical_loss": 3.7324458985412434, + "tokens_seen": 792926208 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1280247, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.983909845352173, + "objective/train/theoretical_loss": 3.7324305840267136, + "objective/train/tokens_used": 813418976, + "theoretical_loss": 3.7324305840267136, + "tokens_seen": 792958976 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038371113340020057, + "loss": 2.9835, + "theoretical_loss": 3.732415270322213, + "tokens_seen": 792991744 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003837011033099298, + "loss": 2.9945, + "theoretical_loss": 3.7323846453429965, + "tokens_seen": 793057280 + }, + { + "epoch": 2.03, + "learning_rate": 0.000383691073219659, + "loss": 2.993, + "theoretical_loss": 3.7323540236029826, + "tokens_seen": 793122816 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038368104312938816, + "loss": 2.726, + "theoretical_loss": 3.7323234051015617, + "tokens_seen": 793188352 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038367101303911735, + "loss": 2.9166, + "theoretical_loss": 3.732292789838124, + "tokens_seen": 793253888 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003836609829488465, + "loss": 2.8748, + "theoretical_loss": 3.732262177812059, + "tokens_seen": 793319424 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003836509528585757, + "loss": 2.9931, + "theoretical_loss": 3.7322315690227565, + "tokens_seen": 793384960 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038364092276830494, + "loss": 2.8771, + "theoretical_loss": 3.7322009634696083, + "tokens_seen": 793450496 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003836308926780341, + "loss": 2.9056, + "theoretical_loss": 3.7321703611520043, + "tokens_seen": 793516032 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003836208625877633, + "loss": 2.9935, + "theoretical_loss": 3.732139762069335, + "tokens_seen": 793581568 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003836108324974925, + "loss": 2.9498, + "theoretical_loss": 3.732109166220992, + "tokens_seen": 793647104 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038360080240722167, + "loss": 2.9269, + "theoretical_loss": 3.7320785736063655, + "tokens_seen": 793712640 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003835907723169509, + "loss": 2.9448, + "theoretical_loss": 3.732047984224848, + "tokens_seen": 793778176 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038358074222668003, + "loss": 3.0644, + "theoretical_loss": 3.73201739807583, + "tokens_seen": 793843712 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038357071213640927, + "loss": 2.8882, + "theoretical_loss": 3.7319868151587032, + "tokens_seen": 793909248 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038356068204613845, + "loss": 2.9178, + "theoretical_loss": 3.7319562354728593, + "tokens_seen": 793974784 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038355065195586763, + "loss": 2.8831, + "theoretical_loss": 3.7319256590176906, + "tokens_seen": 794040320 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003835406218655968, + "loss": 2.9415, + "theoretical_loss": 3.7318950857925888, + "tokens_seen": 794105856 + }, + { + "epoch": 2.03, + "learning_rate": 0.000383530591775326, + "loss": 2.8434, + "theoretical_loss": 3.7318645157969463, + "tokens_seen": 794171392 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038352056168505517, + "loss": 3.0801, + "theoretical_loss": 3.7318339490301558, + "tokens_seen": 794236928 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003835105315947844, + "loss": 3.0045, + "theoretical_loss": 3.7318033854916095, + "tokens_seen": 794302464 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038350050150451353, + "loss": 2.7272, + "theoretical_loss": 3.7317728251807, + "tokens_seen": 794368000 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038349047141424277, + "loss": 3.0018, + "theoretical_loss": 3.731742268096821, + "tokens_seen": 794433536 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003834804413239719, + "loss": 2.778, + "theoretical_loss": 3.7317117142393643, + "tokens_seen": 794499072 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038347041123370113, + "loss": 3.0002, + "theoretical_loss": 3.7316811636077243, + "tokens_seen": 794564608 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1282668, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.81245756149292, + "objective/train/theoretical_loss": 3.7316658895013957, + "objective/train/tokens_used": 815057376, + "theoretical_loss": 3.7316658895013957, + "tokens_seen": 794597376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003834603811434303, + "loss": 3.0461, + "theoretical_loss": 3.7316506162012937, + "tokens_seen": 794630144 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003834503510531595, + "loss": 3.0072, + "theoretical_loss": 3.7316200720194663, + "tokens_seen": 794695680 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003834403209628887, + "loss": 2.8739, + "theoretical_loss": 3.7315895310616365, + "tokens_seen": 794761216 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038343029087261786, + "loss": 2.9776, + "theoretical_loss": 3.731558993327197, + "tokens_seen": 794826752 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038342026078234704, + "loss": 2.9703, + "theoretical_loss": 3.731528458815543, + "tokens_seen": 794892288 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038341023069207627, + "loss": 3.1156, + "theoretical_loss": 3.7314979275260676, + "tokens_seen": 794957824 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003834002006018054, + "loss": 2.7221, + "theoretical_loss": 3.7314673994581655, + "tokens_seen": 795023360 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038339017051153463, + "loss": 3.0876, + "theoretical_loss": 3.731436874611232, + "tokens_seen": 795088896 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003833801404212638, + "loss": 2.8079, + "theoretical_loss": 3.731406352984661, + "tokens_seen": 795154432 + }, + { + "epoch": 2.03, + "learning_rate": 0.000383370110330993, + "loss": 3.0178, + "theoretical_loss": 3.7313758345778476, + "tokens_seen": 795219968 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003833600802407222, + "loss": 3.0667, + "theoretical_loss": 3.7313453193901873, + "tokens_seen": 795285504 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038335005015045136, + "loss": 2.883, + "theoretical_loss": 3.7313148074210742, + "tokens_seen": 795351040 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038334002006018054, + "loss": 2.9661, + "theoretical_loss": 3.7312842986699053, + "tokens_seen": 795416576 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003833299899699098, + "loss": 2.8606, + "theoretical_loss": 3.7312537931360743, + "tokens_seen": 795482112 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003833199598796389, + "loss": 3.1222, + "theoretical_loss": 3.731223290818978, + "tokens_seen": 795547648 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038330992978936814, + "loss": 2.9567, + "theoretical_loss": 3.731192791718012, + "tokens_seen": 795613184 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038329989969909726, + "loss": 2.9141, + "theoretical_loss": 3.731162295832573, + "tokens_seen": 795678720 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003832898696088265, + "loss": 2.9139, + "theoretical_loss": 3.731131803162056, + "tokens_seen": 795744256 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003832798395185557, + "loss": 2.8866, + "theoretical_loss": 3.7311013137058575, + "tokens_seen": 795809792 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038326980942828486, + "loss": 2.9978, + "theoretical_loss": 3.731070827463375, + "tokens_seen": 795875328 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038325977933801404, + "loss": 2.9947, + "theoretical_loss": 3.731040344434004, + "tokens_seen": 795940864 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003832497492477433, + "loss": 2.8054, + "theoretical_loss": 3.7310098646171426, + "tokens_seen": 796006400 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003832397191574724, + "loss": 2.872, + "theoretical_loss": 3.7309793880121864, + "tokens_seen": 796071936 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038322968906720164, + "loss": 2.887, + "theoretical_loss": 3.7309489146185335, + "tokens_seen": 796137472 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038321965897693077, + "loss": 3.0162, + "theoretical_loss": 3.730918444435581, + "tokens_seen": 796203008 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1285455, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.927023410797119, + "objective/train/theoretical_loss": 3.7309032105479294, + "objective/train/tokens_used": 816695776, + "theoretical_loss": 3.7309032105479294, + "tokens_seen": 796235776 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038320962888666, + "loss": 2.9899, + "theoretical_loss": 3.730887977462727, + "tokens_seen": 796268544 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003831995987963892, + "loss": 2.9467, + "theoretical_loss": 3.730857513699368, + "tokens_seen": 796334080 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038318956870611836, + "loss": 2.9795, + "theoretical_loss": 3.7308270531449015, + "tokens_seen": 796399616 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038317953861584755, + "loss": 2.8775, + "theoretical_loss": 3.7307965957987275, + "tokens_seen": 796465152 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003831695085255767, + "loss": 2.9341, + "theoretical_loss": 3.7307661416602422, + "tokens_seen": 796530688 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003831594784353059, + "loss": 3.0145, + "theoretical_loss": 3.730735690728845, + "tokens_seen": 796596224 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038314944834503514, + "loss": 2.8074, + "theoretical_loss": 3.7307052430039334, + "tokens_seen": 796661760 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038313941825476427, + "loss": 2.8374, + "theoretical_loss": 3.730674798484907, + "tokens_seen": 796727296 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003831293881644935, + "loss": 2.9689, + "theoretical_loss": 3.730644357171164, + "tokens_seen": 796792832 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038311935807422263, + "loss": 3.0223, + "theoretical_loss": 3.7306139190621037, + "tokens_seen": 796858368 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038310932798395187, + "loss": 2.9071, + "theoretical_loss": 3.7305834841571253, + "tokens_seen": 796923904 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038309929789368105, + "loss": 2.9625, + "theoretical_loss": 3.7305530524556274, + "tokens_seen": 796989440 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038308926780341023, + "loss": 2.7905, + "theoretical_loss": 3.73052262395701, + "tokens_seen": 797054976 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003830792377131394, + "loss": 3.0477, + "theoretical_loss": 3.7304921986606727, + "tokens_seen": 797120512 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038306920762286865, + "loss": 2.9451, + "theoretical_loss": 3.730461776566015, + "tokens_seen": 797186048 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003830591775325978, + "loss": 2.8354, + "theoretical_loss": 3.730431357672437, + "tokens_seen": 797251584 + }, + { + "epoch": 2.03, + "learning_rate": 0.000383049147442327, + "loss": 2.7927, + "theoretical_loss": 3.730400941979338, + "tokens_seen": 797317120 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038303911735205614, + "loss": 3.0465, + "theoretical_loss": 3.7303705294861196, + "tokens_seen": 797382656 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038302908726178537, + "loss": 2.9879, + "theoretical_loss": 3.7303401201921815, + "tokens_seen": 797448192 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038301905717151455, + "loss": 2.8981, + "theoretical_loss": 3.730309714096924, + "tokens_seen": 797513728 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038300902708124373, + "loss": 2.9235, + "theoretical_loss": 3.730279311199748, + "tokens_seen": 797579264 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003829989969909729, + "loss": 3.0732, + "theoretical_loss": 3.730248911500055, + "tokens_seen": 797644800 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003829889669007021, + "loss": 2.9689, + "theoretical_loss": 3.7302185149972455, + "tokens_seen": 797710336 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003829789368104313, + "loss": 2.8988, + "theoretical_loss": 3.7301881216907207, + "tokens_seen": 797775872 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003829689067201605, + "loss": 2.9175, + "theoretical_loss": 3.730157731579882, + "tokens_seen": 797841408 + }, + { + "epoch": 2.03, + "objective/train/docs_used": 1288022, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9262382984161377, + "objective/train/theoretical_loss": 3.7301425377226582, + "objective/train/tokens_used": 818334176, + "theoretical_loss": 3.7301425377226582, + "tokens_seen": 797874176 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038295887662988964, + "loss": 2.9252, + "theoretical_loss": 3.730127344664131, + "tokens_seen": 797906944 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003829488465396189, + "loss": 2.7756, + "theoretical_loss": 3.73009696094287, + "tokens_seen": 797972480 + }, + { + "epoch": 2.03, + "learning_rate": 0.000382938816449348, + "loss": 2.9415, + "theoretical_loss": 3.730066580415499, + "tokens_seen": 798038016 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038292878635907724, + "loss": 3.0336, + "theoretical_loss": 3.730036203081422, + "tokens_seen": 798103552 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003829187562688064, + "loss": 2.9481, + "theoretical_loss": 3.7300058289400404, + "tokens_seen": 798169088 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003829087261785356, + "loss": 3.0135, + "theoretical_loss": 3.7299754579907565, + "tokens_seen": 798234624 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003828986960882648, + "loss": 2.9615, + "theoretical_loss": 3.729945090232973, + "tokens_seen": 798300160 + }, + { + "epoch": 2.03, + "learning_rate": 0.000382888665997994, + "loss": 2.9095, + "theoretical_loss": 3.7299147256660925, + "tokens_seen": 798365696 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003828786359077232, + "loss": 3.0235, + "theoretical_loss": 3.729884364289517, + "tokens_seen": 798431232 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003828686058174524, + "loss": 2.949, + "theoretical_loss": 3.7298540061026513, + "tokens_seen": 798496768 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038285857572718156, + "loss": 3.0517, + "theoretical_loss": 3.7298236511048968, + "tokens_seen": 798562304 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038284854563691074, + "loss": 2.8269, + "theoretical_loss": 3.729793299295658, + "tokens_seen": 798627840 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038283851554664, + "loss": 2.8449, + "theoretical_loss": 3.7297629506743375, + "tokens_seen": 798693376 + }, + { + "epoch": 2.03, + "learning_rate": 0.0003828284854563691, + "loss": 2.8793, + "theoretical_loss": 3.729732605240339, + "tokens_seen": 798758912 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038281845536609834, + "loss": 3.0931, + "theoretical_loss": 3.729702262993067, + "tokens_seen": 798824448 + }, + { + "epoch": 2.03, + "learning_rate": 0.00038280842527582746, + "loss": 3.0164, + "theoretical_loss": 3.729671923931925, + "tokens_seen": 798889984 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003827983951855567, + "loss": 2.9991, + "theoretical_loss": 3.7296415880563174, + "tokens_seen": 798955520 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003827883650952859, + "loss": 2.9905, + "theoretical_loss": 3.7296112553656475, + "tokens_seen": 799021056 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038277833500501506, + "loss": 2.7795, + "theoretical_loss": 3.729580925859321, + "tokens_seen": 799086592 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038276830491474424, + "loss": 2.8528, + "theoretical_loss": 3.7295505995367417, + "tokens_seen": 799152128 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003827582748244735, + "loss": 2.9549, + "theoretical_loss": 3.729520276397314, + "tokens_seen": 799217664 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003827482447342026, + "loss": 2.8904, + "theoretical_loss": 3.7294899564404442, + "tokens_seen": 799283200 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038273821464393184, + "loss": 2.8701, + "theoretical_loss": 3.729459639665536, + "tokens_seen": 799348736 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038272818455366097, + "loss": 2.982, + "theoretical_loss": 3.7294293260719953, + "tokens_seen": 799414272 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003827181544633902, + "loss": 2.9708, + "theoretical_loss": 3.7293990156592276, + "tokens_seen": 799479808 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1290903, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9146595001220703, + "objective/train/theoretical_loss": 3.7293838616454478, + "objective/train/tokens_used": 819972576, + "theoretical_loss": 3.7293838616454478, + "tokens_seen": 799512576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003827081243731194, + "loss": 3.0528, + "theoretical_loss": 3.729368708426638, + "tokens_seen": 799545344 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038269809428284856, + "loss": 3.0358, + "theoretical_loss": 3.7293384043736317, + "tokens_seen": 799610880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038268806419257775, + "loss": 3.0417, + "theoretical_loss": 3.7293081034996165, + "tokens_seen": 799676416 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003826780341023069, + "loss": 3.0108, + "theoretical_loss": 3.7292778058039966, + "tokens_seen": 799741952 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003826680040120361, + "loss": 2.9706, + "theoretical_loss": 3.7292475112861787, + "tokens_seen": 799807488 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038265797392176534, + "loss": 2.8867, + "theoretical_loss": 3.729217219945569, + "tokens_seen": 799873024 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038264794383149447, + "loss": 2.8854, + "theoretical_loss": 3.729186931781575, + "tokens_seen": 799938560 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003826379137412237, + "loss": 2.9833, + "theoretical_loss": 3.7291566467936015, + "tokens_seen": 800004096 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038262788365095283, + "loss": 3.1349, + "theoretical_loss": 3.7291263649810573, + "tokens_seen": 800069632 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038261785356068207, + "loss": 2.885, + "theoretical_loss": 3.729096086343348, + "tokens_seen": 800135168 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038260782347041125, + "loss": 3.0549, + "theoretical_loss": 3.7290658108798818, + "tokens_seen": 800200704 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038259779338014043, + "loss": 2.7935, + "theoretical_loss": 3.729035538590065, + "tokens_seen": 800266240 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003825877632898696, + "loss": 3.0642, + "theoretical_loss": 3.729005269473306, + "tokens_seen": 800331776 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038257773319959885, + "loss": 2.9062, + "theoretical_loss": 3.728975003529011, + "tokens_seen": 800397312 + }, + { + "epoch": 2.04, + "learning_rate": 0.000382567703109328, + "loss": 2.9315, + "theoretical_loss": 3.7289447407565897, + "tokens_seen": 800462848 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003825576730190572, + "loss": 3.1198, + "theoretical_loss": 3.7289144811554484, + "tokens_seen": 800528384 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038254764292878634, + "loss": 3.0415, + "theoretical_loss": 3.728884224724996, + "tokens_seen": 800593920 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038253761283851557, + "loss": 2.7325, + "theoretical_loss": 3.7288539714646407, + "tokens_seen": 800659456 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038252758274824475, + "loss": 2.8368, + "theoretical_loss": 3.7288237213737907, + "tokens_seen": 800724992 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038251755265797393, + "loss": 2.8, + "theoretical_loss": 3.7287934744518547, + "tokens_seen": 800790528 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003825075225677031, + "loss": 2.9942, + "theoretical_loss": 3.7287632306982417, + "tokens_seen": 800856064 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003824974924774323, + "loss": 2.9043, + "theoretical_loss": 3.72873299011236, + "tokens_seen": 800921600 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003824874623871615, + "loss": 3.0344, + "theoretical_loss": 3.7287027526936187, + "tokens_seen": 800987136 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003824774322968907, + "loss": 2.9519, + "theoretical_loss": 3.728672518441428, + "tokens_seen": 801052672 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038246740220661984, + "loss": 3.0339, + "theoretical_loss": 3.7286422873551963, + "tokens_seen": 801118208 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1293809, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2784979343414307, + "objective/train/theoretical_loss": 3.72862717299913, + "objective/train/tokens_used": 821610976, + "theoretical_loss": 3.72862717299913, + "tokens_seen": 801150976 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003824573721163491, + "loss": 3.0124, + "theoretical_loss": 3.728612059434333, + "tokens_seen": 801183744 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003824473420260782, + "loss": 2.9918, + "theoretical_loss": 3.7285818346782484, + "tokens_seen": 801249280 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038243731193580744, + "loss": 2.9147, + "theoretical_loss": 3.728551613086352, + "tokens_seen": 801314816 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003824272818455366, + "loss": 2.9809, + "theoretical_loss": 3.728521394658054, + "tokens_seen": 801380352 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003824172517552658, + "loss": 3.0366, + "theoretical_loss": 3.7284911793927646, + "tokens_seen": 801445888 + }, + { + "epoch": 2.04, + "learning_rate": 0.000382407221664995, + "loss": 3.0521, + "theoretical_loss": 3.7284609672898936, + "tokens_seen": 801511424 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003823971915747242, + "loss": 2.9445, + "theoretical_loss": 3.728430758348852, + "tokens_seen": 801576960 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038238716148445334, + "loss": 2.8993, + "theoretical_loss": 3.7284005525690507, + "tokens_seen": 801642496 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003823771313941826, + "loss": 2.9592, + "theoretical_loss": 3.7283703499498992, + "tokens_seen": 801708032 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003823671013039117, + "loss": 2.9619, + "theoretical_loss": 3.7283401504908102, + "tokens_seen": 801773568 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038235707121364094, + "loss": 2.9324, + "theoretical_loss": 3.7283099541911935, + "tokens_seen": 801839104 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003823470411233701, + "loss": 3.0338, + "theoretical_loss": 3.728279761050461, + "tokens_seen": 801904640 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003823370110330993, + "loss": 2.7986, + "theoretical_loss": 3.728249571068024, + "tokens_seen": 801970176 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003823269809428285, + "loss": 2.9717, + "theoretical_loss": 3.7282193842432934, + "tokens_seen": 802035712 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038231695085255766, + "loss": 2.9417, + "theoretical_loss": 3.728189200575682, + "tokens_seen": 802101248 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038230692076228685, + "loss": 2.9139, + "theoretical_loss": 3.7281590200646013, + "tokens_seen": 802166784 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003822968906720161, + "loss": 2.9482, + "theoretical_loss": 3.728128842709463, + "tokens_seen": 802232320 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003822868605817452, + "loss": 2.9391, + "theoretical_loss": 3.7280986685096797, + "tokens_seen": 802297856 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038227683049147444, + "loss": 3.0302, + "theoretical_loss": 3.7280684974646636, + "tokens_seen": 802363392 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038226680040120357, + "loss": 3.1728, + "theoretical_loss": 3.7280383295738275, + "tokens_seen": 802428928 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003822567703109328, + "loss": 2.8254, + "theoretical_loss": 3.728008164836584, + "tokens_seen": 802494464 + }, + { + "epoch": 2.04, + "learning_rate": 0.000382246740220662, + "loss": 2.9349, + "theoretical_loss": 3.7279780032523453, + "tokens_seen": 802560000 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038223671013039117, + "loss": 3.0607, + "theoretical_loss": 3.7279478448205254, + "tokens_seen": 802625536 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038222668004012035, + "loss": 2.9923, + "theoretical_loss": 3.7279176895405373, + "tokens_seen": 802691072 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003822166499498496, + "loss": 2.8097, + "theoretical_loss": 3.7278875374117932, + "tokens_seen": 802756608 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1296393, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6451735496520996, + "objective/train/theoretical_loss": 3.727872462528955, + "objective/train/tokens_used": 823249376, + "theoretical_loss": 3.727872462528955, + "tokens_seen": 802789376 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003822066198595787, + "loss": 2.9592, + "theoretical_loss": 3.7278573884337076, + "tokens_seen": 802822144 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038219658976930795, + "loss": 3.0363, + "theoretical_loss": 3.727827242605694, + "tokens_seen": 802887680 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003821865596790371, + "loss": 2.9513, + "theoretical_loss": 3.727797099927166, + "tokens_seen": 802953216 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003821765295887663, + "loss": 2.8667, + "theoretical_loss": 3.727766960397538, + "tokens_seen": 803018752 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003821664994984955, + "loss": 2.8765, + "theoretical_loss": 3.7277368240162234, + "tokens_seen": 803084288 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038215646940822467, + "loss": 2.939, + "theoretical_loss": 3.727706690782637, + "tokens_seen": 803149824 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038214643931795385, + "loss": 2.7638, + "theoretical_loss": 3.7276765606961924, + "tokens_seen": 803215360 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038213640922768303, + "loss": 2.97, + "theoretical_loss": 3.7276464337563047, + "tokens_seen": 803280896 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038212637913741227, + "loss": 2.8836, + "theoretical_loss": 3.7276163099623894, + "tokens_seen": 803346432 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038211634904714145, + "loss": 2.8871, + "theoretical_loss": 3.72758618931386, + "tokens_seen": 803411968 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038210631895687063, + "loss": 3.0541, + "theoretical_loss": 3.727556071810133, + "tokens_seen": 803477504 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003820962888665998, + "loss": 2.9082, + "theoretical_loss": 3.727525957450622, + "tokens_seen": 803543040 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038208625877632905, + "loss": 2.8021, + "theoretical_loss": 3.727495846234743, + "tokens_seen": 803608576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003820762286860582, + "loss": 2.8969, + "theoretical_loss": 3.7274657381619125, + "tokens_seen": 803674112 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003820661985957874, + "loss": 2.9191, + "theoretical_loss": 3.7274356332315444, + "tokens_seen": 803739648 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038205616850551654, + "loss": 2.9736, + "theoretical_loss": 3.727405531443056, + "tokens_seen": 803805184 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038204613841524577, + "loss": 3.092, + "theoretical_loss": 3.7273754327958626, + "tokens_seen": 803870720 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038203610832497495, + "loss": 2.9802, + "theoretical_loss": 3.72734533728938, + "tokens_seen": 803936256 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038202607823470413, + "loss": 3.0744, + "theoretical_loss": 3.727315244923026, + "tokens_seen": 804001792 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003820160481444333, + "loss": 2.8084, + "theoretical_loss": 3.7272851556962148, + "tokens_seen": 804067328 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003820060180541625, + "loss": 2.9841, + "theoretical_loss": 3.727255069608365, + "tokens_seen": 804132864 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003819959879638917, + "loss": 2.9206, + "theoretical_loss": 3.727224986658892, + "tokens_seen": 804198400 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003819859578736209, + "loss": 2.8488, + "theoretical_loss": 3.727194906847213, + "tokens_seen": 804263936 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038197592778335004, + "loss": 3.0434, + "theoretical_loss": 3.727164830172746, + "tokens_seen": 804329472 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003819658976930793, + "loss": 3.1355, + "theoretical_loss": 3.727134756634907, + "tokens_seen": 804395008 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1299284, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.897062063217163, + "objective/train/theoretical_loss": 3.7271197210420413, + "objective/train/tokens_used": 824887776, + "theoretical_loss": 3.7271197210420413, + "tokens_seen": 804427776 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003819558676028084, + "loss": 2.9151, + "theoretical_loss": 3.727104686233114, + "tokens_seen": 804460544 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038194583751253764, + "loss": 3.0683, + "theoretical_loss": 3.7270746189667845, + "tokens_seen": 804526080 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003819358074222668, + "loss": 2.9724, + "theoretical_loss": 3.727044554835336, + "tokens_seen": 804591616 + }, + { + "epoch": 2.04, + "learning_rate": 0.000381925777331996, + "loss": 2.8608, + "theoretical_loss": 3.727014493838187, + "tokens_seen": 804657152 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003819157472417252, + "loss": 3.0216, + "theoretical_loss": 3.7269844359747544, + "tokens_seen": 804722688 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003819057171514544, + "loss": 2.868, + "theoretical_loss": 3.7269543812444574, + "tokens_seen": 804788224 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038189568706118354, + "loss": 2.9929, + "theoretical_loss": 3.7269243296467134, + "tokens_seen": 804853760 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003818856569709128, + "loss": 3.0233, + "theoretical_loss": 3.7268942811809413, + "tokens_seen": 804919296 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003818756268806419, + "loss": 2.8312, + "theoretical_loss": 3.72686423584656, + "tokens_seen": 804984832 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038186559679037114, + "loss": 2.8713, + "theoretical_loss": 3.726834193642988, + "tokens_seen": 805050368 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003818555667001003, + "loss": 3.0084, + "theoretical_loss": 3.726804154569644, + "tokens_seen": 805115904 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003818455366098295, + "loss": 2.9289, + "theoretical_loss": 3.726774118625948, + "tokens_seen": 805181440 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003818355065195587, + "loss": 3.0082, + "theoretical_loss": 3.726744085811318, + "tokens_seen": 805246976 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038182547642928786, + "loss": 2.9824, + "theoretical_loss": 3.7267140561251737, + "tokens_seen": 805312512 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038181544633901705, + "loss": 2.8254, + "theoretical_loss": 3.7266840295669352, + "tokens_seen": 805378048 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003818054162487463, + "loss": 2.9097, + "theoretical_loss": 3.7266540061360223, + "tokens_seen": 805443584 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003817953861584754, + "loss": 2.974, + "theoretical_loss": 3.7266239858318544, + "tokens_seen": 805509120 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038178535606820464, + "loss": 2.9521, + "theoretical_loss": 3.7265939686538516, + "tokens_seen": 805574656 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038177532597793377, + "loss": 3.009, + "theoretical_loss": 3.726563954601434, + "tokens_seen": 805640192 + }, + { + "epoch": 2.04, + "learning_rate": 0.000381765295887663, + "loss": 2.9658, + "theoretical_loss": 3.726533943674022, + "tokens_seen": 805705728 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003817552657973922, + "loss": 3.0246, + "theoretical_loss": 3.726503935871036, + "tokens_seen": 805771264 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038174523570712137, + "loss": 3.0655, + "theoretical_loss": 3.7264739311918964, + "tokens_seen": 805836800 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038173520561685055, + "loss": 3.1786, + "theoretical_loss": 3.726443929636025, + "tokens_seen": 805902336 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003817251755265798, + "loss": 2.8107, + "theoretical_loss": 3.726413931202842, + "tokens_seen": 805967872 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003817151454363089, + "loss": 3.0259, + "theoretical_loss": 3.7263839358917688, + "tokens_seen": 806033408 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1302019, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.464930295944214, + "objective/train/theoretical_loss": 3.726368939406842, + "objective/train/tokens_used": 826526176, + "theoretical_loss": 3.726368939406842, + "tokens_seen": 806066176 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038170511534603815, + "loss": 2.7724, + "theoretical_loss": 3.7263539437022257, + "tokens_seen": 806098944 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003816950852557673, + "loss": 2.9288, + "theoretical_loss": 3.726323954633635, + "tokens_seen": 806164480 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003816850551654965, + "loss": 3.0091, + "theoretical_loss": 3.726293968685419, + "tokens_seen": 806230016 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003816750250752257, + "loss": 2.9913, + "theoretical_loss": 3.7262639858569977, + "tokens_seen": 806295552 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038166499498495487, + "loss": 2.9671, + "theoretical_loss": 3.726234006147794, + "tokens_seen": 806361088 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038165496489468405, + "loss": 2.9303, + "theoretical_loss": 3.7262040295572296, + "tokens_seen": 806426624 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038164493480441323, + "loss": 2.7184, + "theoretical_loss": 3.7261740560847274, + "tokens_seen": 806492160 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003816349047141424, + "loss": 2.8468, + "theoretical_loss": 3.7261440857297083, + "tokens_seen": 806557696 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038162487462387165, + "loss": 2.9108, + "theoretical_loss": 3.726114118491596, + "tokens_seen": 806623232 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003816148445336008, + "loss": 2.8577, + "theoretical_loss": 3.726084154369813, + "tokens_seen": 806688768 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038160481444333, + "loss": 2.8832, + "theoretical_loss": 3.7260541933637814, + "tokens_seen": 806754304 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038159478435305914, + "loss": 2.9959, + "theoretical_loss": 3.726024235472925, + "tokens_seen": 806819840 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003815847542627884, + "loss": 3.0693, + "theoretical_loss": 3.725994280696666, + "tokens_seen": 806885376 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038157472417251756, + "loss": 2.8409, + "theoretical_loss": 3.725964329034429, + "tokens_seen": 806950912 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038156469408224674, + "loss": 3.0031, + "theoretical_loss": 3.725934380485636, + "tokens_seen": 807016448 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003815546639919759, + "loss": 3.0116, + "theoretical_loss": 3.7259044350497112, + "tokens_seen": 807081984 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038154463390170515, + "loss": 2.9553, + "theoretical_loss": 3.7258744927260787, + "tokens_seen": 807147520 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003815346038114343, + "loss": 2.9878, + "theoretical_loss": 3.725844553514161, + "tokens_seen": 807213056 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003815245737211635, + "loss": 2.9658, + "theoretical_loss": 3.725814617413384, + "tokens_seen": 807278592 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038151454363089264, + "loss": 2.8218, + "theoretical_loss": 3.7257846844231706, + "tokens_seen": 807344128 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003815045135406219, + "loss": 3.0238, + "theoretical_loss": 3.7257547545429457, + "tokens_seen": 807409664 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038149448345035106, + "loss": 2.8702, + "theoretical_loss": 3.725724827772133, + "tokens_seen": 807475200 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038148445336008024, + "loss": 3.0144, + "theoretical_loss": 3.725694904110158, + "tokens_seen": 807540736 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003814744232698094, + "loss": 2.9044, + "theoretical_loss": 3.7256649835564453, + "tokens_seen": 807606272 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003814643931795386, + "loss": 2.9156, + "theoretical_loss": 3.72563506611042, + "tokens_seen": 807671808 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1303383, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.733099937438965, + "objective/train/theoretical_loss": 3.72562010855261, + "objective/train/tokens_used": 828164576, + "theoretical_loss": 3.72562010855261, + "tokens_seen": 807704576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003814543630892678, + "loss": 3.005, + "theoretical_loss": 3.7256051517715063, + "tokens_seen": 807737344 + }, + { + "epoch": 2.04, + "learning_rate": 0.000381444332998997, + "loss": 2.9792, + "theoretical_loss": 3.7255752405391305, + "tokens_seen": 807802880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038143430290872614, + "loss": 2.9613, + "theoretical_loss": 3.7255453324127172, + "tokens_seen": 807868416 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003814242728184554, + "loss": 2.9918, + "theoretical_loss": 3.725515427391693, + "tokens_seen": 807933952 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038141424272818456, + "loss": 2.9073, + "theoretical_loss": 3.7254855254754826, + "tokens_seen": 807999488 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038140421263791374, + "loss": 3.098, + "theoretical_loss": 3.725455626663512, + "tokens_seen": 808065024 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003813941825476429, + "loss": 2.9808, + "theoretical_loss": 3.7254257309552075, + "tokens_seen": 808130560 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003813841524573721, + "loss": 3.07, + "theoretical_loss": 3.7253958383499954, + "tokens_seen": 808196096 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038137412236710134, + "loss": 3.1417, + "theoretical_loss": 3.725365948847301, + "tokens_seen": 808261632 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003813640922768305, + "loss": 3.0009, + "theoretical_loss": 3.725336062446553, + "tokens_seen": 808327168 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003813540621865597, + "loss": 3.0433, + "theoretical_loss": 3.7253061791471755, + "tokens_seen": 808392704 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003813440320962889, + "loss": 2.831, + "theoretical_loss": 3.7252762989485966, + "tokens_seen": 808458240 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038133400200601806, + "loss": 2.9772, + "theoretical_loss": 3.7252464218502435, + "tokens_seen": 808523776 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038132397191574725, + "loss": 2.9226, + "theoretical_loss": 3.7252165478515424, + "tokens_seen": 808589312 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003813139418254765, + "loss": 2.9408, + "theoretical_loss": 3.725186676951921, + "tokens_seen": 808654848 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003813039117352056, + "loss": 2.8898, + "theoretical_loss": 3.725156809150806, + "tokens_seen": 808720384 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038129388164493484, + "loss": 2.8996, + "theoretical_loss": 3.7251269444476263, + "tokens_seen": 808785920 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038128385155466397, + "loss": 2.7322, + "theoretical_loss": 3.725097082841809, + "tokens_seen": 808851456 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003812738214643932, + "loss": 2.9517, + "theoretical_loss": 3.725067224332781, + "tokens_seen": 808916992 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003812637913741224, + "loss": 2.824, + "theoretical_loss": 3.725037368919972, + "tokens_seen": 808982528 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038125376128385157, + "loss": 3.0148, + "theoretical_loss": 3.7250075166028083, + "tokens_seen": 809048064 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038124373119358075, + "loss": 2.9555, + "theoretical_loss": 3.7249776673807196, + "tokens_seen": 809113600 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038123370110331, + "loss": 2.9154, + "theoretical_loss": 3.724947821253134, + "tokens_seen": 809179136 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003812236710130391, + "loss": 3.0252, + "theoretical_loss": 3.72491797821948, + "tokens_seen": 809244672 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038121364092276835, + "loss": 2.8267, + "theoretical_loss": 3.7248881382791863, + "tokens_seen": 809310208 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1306217, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7984375953674316, + "objective/train/theoretical_loss": 3.7248732194688707, + "objective/train/tokens_used": 829802976, + "theoretical_loss": 3.7248732194688707, + "tokens_seen": 809342976 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003812036108324975, + "loss": 2.9498, + "theoretical_loss": 3.7248583014316816, + "tokens_seen": 809375744 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003811935807422267, + "loss": 2.9015, + "theoretical_loss": 3.724828467676395, + "tokens_seen": 809441280 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003811835506519559, + "loss": 2.959, + "theoretical_loss": 3.724798637012756, + "tokens_seen": 809506816 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038117352056168507, + "loss": 2.9689, + "theoretical_loss": 3.724768809440194, + "tokens_seen": 809572352 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038116349047141425, + "loss": 2.9711, + "theoretical_loss": 3.7247389849581385, + "tokens_seen": 809637888 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038115346038114343, + "loss": 2.8482, + "theoretical_loss": 3.724709163566019, + "tokens_seen": 809703424 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003811434302908726, + "loss": 3.0358, + "theoretical_loss": 3.724679345263265, + "tokens_seen": 809768960 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038113340020060185, + "loss": 2.9198, + "theoretical_loss": 3.724649530049308, + "tokens_seen": 809834496 + }, + { + "epoch": 2.04, + "learning_rate": 0.000381123370110331, + "loss": 2.9901, + "theoretical_loss": 3.7246197179235754, + "tokens_seen": 809900032 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003811133400200602, + "loss": 2.952, + "theoretical_loss": 3.7245899088855, + "tokens_seen": 809965568 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038110330992978934, + "loss": 2.8945, + "theoretical_loss": 3.724560102934511, + "tokens_seen": 810031104 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003810932798395186, + "loss": 2.937, + "theoretical_loss": 3.7245303000700387, + "tokens_seen": 810096640 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038108324974924776, + "loss": 2.9868, + "theoretical_loss": 3.724500500291515, + "tokens_seen": 810162176 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038107321965897694, + "loss": 2.856, + "theoretical_loss": 3.72447070359837, + "tokens_seen": 810227712 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003810631895687061, + "loss": 2.9216, + "theoretical_loss": 3.724440909990034, + "tokens_seen": 810293248 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038105315947843535, + "loss": 3.0452, + "theoretical_loss": 3.72441111946594, + "tokens_seen": 810358784 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003810431293881645, + "loss": 3.1735, + "theoretical_loss": 3.724381332025518, + "tokens_seen": 810424320 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003810330992978937, + "loss": 2.9292, + "theoretical_loss": 3.7243515476681996, + "tokens_seen": 810489856 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038102306920762284, + "loss": 2.9203, + "theoretical_loss": 3.724321766393417, + "tokens_seen": 810555392 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003810130391173521, + "loss": 3.009, + "theoretical_loss": 3.7242919882006014, + "tokens_seen": 810620928 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038100300902708126, + "loss": 2.9216, + "theoretical_loss": 3.724262213089185, + "tokens_seen": 810686464 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038099297893681044, + "loss": 2.9341, + "theoretical_loss": 3.724232441058599, + "tokens_seen": 810752000 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003809829488465396, + "loss": 2.9644, + "theoretical_loss": 3.7242026721082775, + "tokens_seen": 810817536 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003809729187562688, + "loss": 3.032, + "theoretical_loss": 3.7241729062376514, + "tokens_seen": 810883072 + }, + { + "epoch": 2.04, + "learning_rate": 0.000380962888665998, + "loss": 2.8362, + "theoretical_loss": 3.724143143446154, + "tokens_seen": 810948608 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1309160, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.079915761947632, + "objective/train/theoretical_loss": 3.724128263204901, + "objective/train/tokens_used": 831441376, + "theoretical_loss": 3.724128263204901, + "tokens_seen": 810981376 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003809528585757272, + "loss": 2.9727, + "theoretical_loss": 3.724113383733217, + "tokens_seen": 811014144 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038094282848545635, + "loss": 2.9536, + "theoretical_loss": 3.7240836270982745, + "tokens_seen": 811079680 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003809327983951856, + "loss": 2.9437, + "theoretical_loss": 3.7240538735407585, + "tokens_seen": 811145216 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038092276830491476, + "loss": 2.8351, + "theoretical_loss": 3.7240241230601026, + "tokens_seen": 811210752 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038091273821464394, + "loss": 2.9698, + "theoretical_loss": 3.72399437565574, + "tokens_seen": 811276288 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003809027081243731, + "loss": 2.836, + "theoretical_loss": 3.723964631327104, + "tokens_seen": 811341824 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003808926780341023, + "loss": 3.0528, + "theoretical_loss": 3.7239348900736284, + "tokens_seen": 811407360 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003808826479438315, + "loss": 2.9405, + "theoretical_loss": 3.723905151894747, + "tokens_seen": 811472896 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003808726178535607, + "loss": 2.9538, + "theoretical_loss": 3.723875416789893, + "tokens_seen": 811538432 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038086258776328985, + "loss": 2.9723, + "theoretical_loss": 3.7238456847585013, + "tokens_seen": 811603968 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003808525576730191, + "loss": 2.963, + "theoretical_loss": 3.7238159558000055, + "tokens_seen": 811669504 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003808425275827482, + "loss": 2.9309, + "theoretical_loss": 3.72378622991384, + "tokens_seen": 811735040 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038083249749247745, + "loss": 2.8295, + "theoretical_loss": 3.7237565070994396, + "tokens_seen": 811800576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003808224674022066, + "loss": 2.9834, + "theoretical_loss": 3.723726787356239, + "tokens_seen": 811866112 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003808124373119358, + "loss": 3.0167, + "theoretical_loss": 3.7236970706836723, + "tokens_seen": 811931648 + }, + { + "epoch": 2.04, + "learning_rate": 0.000380802407221665, + "loss": 2.9968, + "theoretical_loss": 3.723667357081175, + "tokens_seen": 811997184 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038079237713139417, + "loss": 3.0215, + "theoretical_loss": 3.723637646548182, + "tokens_seen": 812062720 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038078234704112335, + "loss": 2.9797, + "theoretical_loss": 3.723607939084129, + "tokens_seen": 812128256 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003807723169508526, + "loss": 2.8964, + "theoretical_loss": 3.7235782346884507, + "tokens_seen": 812193792 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003807622868605817, + "loss": 3.1421, + "theoretical_loss": 3.7235485333605833, + "tokens_seen": 812259328 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038075225677031095, + "loss": 2.9343, + "theoretical_loss": 3.7235188350999615, + "tokens_seen": 812324864 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038074222668004013, + "loss": 3.0297, + "theoretical_loss": 3.7234891399060217, + "tokens_seen": 812390400 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003807321965897693, + "loss": 2.9031, + "theoretical_loss": 3.7234594477782004, + "tokens_seen": 812455936 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003807221664994985, + "loss": 2.875, + "theoretical_loss": 3.723429758715933, + "tokens_seen": 812521472 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003807121364092277, + "loss": 2.8678, + "theoretical_loss": 3.723400072718656, + "tokens_seen": 812587008 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1311485, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9958274364471436, + "objective/train/theoretical_loss": 3.7233852308692126, + "objective/train/tokens_used": 833079776, + "theoretical_loss": 3.7233852308692126, + "tokens_seen": 812619776 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038070210631895685, + "loss": 2.8663, + "theoretical_loss": 3.7233703897858055, + "tokens_seen": 812652544 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003806920762286861, + "loss": 2.9082, + "theoretical_loss": 3.7233407099168185, + "tokens_seen": 812718080 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003806820461384152, + "loss": 2.8751, + "theoretical_loss": 3.7233110331111314, + "tokens_seen": 812783616 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038067201604814445, + "loss": 2.8175, + "theoretical_loss": 3.7232813593681824, + "tokens_seen": 812849152 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003806619859578736, + "loss": 2.9433, + "theoretical_loss": 3.723251688687406, + "tokens_seen": 812914688 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003806519558676028, + "loss": 3.0775, + "theoretical_loss": 3.7232220210682416, + "tokens_seen": 812980224 + }, + { + "epoch": 2.04, + "learning_rate": 0.000380641925777332, + "loss": 2.9038, + "theoretical_loss": 3.7231923565101255, + "tokens_seen": 813045760 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003806318956870612, + "loss": 2.7443, + "theoretical_loss": 3.723162695012495, + "tokens_seen": 813111296 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003806218655967904, + "loss": 2.9075, + "theoretical_loss": 3.7231330365747883, + "tokens_seen": 813176832 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038061183550651954, + "loss": 2.9386, + "theoretical_loss": 3.7231033811964434, + "tokens_seen": 813242368 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003806018054162488, + "loss": 2.943, + "theoretical_loss": 3.7230737288768974, + "tokens_seen": 813307904 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038059177532597796, + "loss": 2.9877, + "theoretical_loss": 3.7230440796155886, + "tokens_seen": 813373440 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038058174523570714, + "loss": 2.9347, + "theoretical_loss": 3.7230144334119553, + "tokens_seen": 813438976 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003805717151454363, + "loss": 2.8786, + "theoretical_loss": 3.722984790265436, + "tokens_seen": 813504512 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038056168505516555, + "loss": 3.0152, + "theoretical_loss": 3.722955150175469, + "tokens_seen": 813570048 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003805516549648947, + "loss": 2.927, + "theoretical_loss": 3.722925513141493, + "tokens_seen": 813635584 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003805416248746239, + "loss": 3.1195, + "theoretical_loss": 3.722895879162947, + "tokens_seen": 813701120 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038053159478435304, + "loss": 2.7875, + "theoretical_loss": 3.72286624823927, + "tokens_seen": 813766656 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003805215646940823, + "loss": 3.0204, + "theoretical_loss": 3.722836620369901, + "tokens_seen": 813832192 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038051153460381146, + "loss": 2.8118, + "theoretical_loss": 3.7228069955542784, + "tokens_seen": 813897728 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038050150451354064, + "loss": 3.0511, + "theoretical_loss": 3.722777373791843, + "tokens_seen": 813963264 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003804914744232698, + "loss": 2.9671, + "theoretical_loss": 3.7227477550820334, + "tokens_seen": 814028800 + }, + { + "epoch": 2.04, + "learning_rate": 0.000380481444332999, + "loss": 2.9254, + "theoretical_loss": 3.72271813942429, + "tokens_seen": 814094336 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003804714142427282, + "loss": 2.8056, + "theoretical_loss": 3.7226885268180516, + "tokens_seen": 814159872 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003804613841524574, + "loss": 2.8504, + "theoretical_loss": 3.722658917262759, + "tokens_seen": 814225408 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1314346, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1620001792907715, + "objective/train/theoretical_loss": 3.722644113629043, + "objective/train/tokens_used": 834718176, + "theoretical_loss": 3.722644113629043, + "tokens_seen": 814258176 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038045135406218655, + "loss": 2.9939, + "theoretical_loss": 3.7226293107578523, + "tokens_seen": 814290944 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003804413239719158, + "loss": 2.8793, + "theoretical_loss": 3.7225997073027717, + "tokens_seen": 814356480 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038043129388164496, + "loss": 2.9504, + "theoretical_loss": 3.722570106896957, + "tokens_seen": 814422016 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038042126379137414, + "loss": 2.9808, + "theoretical_loss": 3.7225405095398503, + "tokens_seen": 814487552 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003804112337011033, + "loss": 3.0904, + "theoretical_loss": 3.722510915230891, + "tokens_seen": 814553088 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003804012036108325, + "loss": 2.8348, + "theoretical_loss": 3.72248132396952, + "tokens_seen": 814618624 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003803911735205617, + "loss": 3.0689, + "theoretical_loss": 3.722451735755179, + "tokens_seen": 814684160 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003803811434302909, + "loss": 2.8648, + "theoretical_loss": 3.7224221505873087, + "tokens_seen": 814749696 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038037111334002005, + "loss": 2.8894, + "theoretical_loss": 3.7223925684653505, + "tokens_seen": 814815232 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003803610832497493, + "loss": 3.0017, + "theoretical_loss": 3.7223629893887464, + "tokens_seen": 814880768 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003803510531594784, + "loss": 3.0279, + "theoretical_loss": 3.722333413356937, + "tokens_seen": 814946304 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038034102306920765, + "loss": 3.0495, + "theoretical_loss": 3.7223038403693645, + "tokens_seen": 815011840 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038033099297893683, + "loss": 2.9546, + "theoretical_loss": 3.7222742704254714, + "tokens_seen": 815077376 + }, + { + "epoch": 2.04, + "learning_rate": 0.000380320962888666, + "loss": 2.9667, + "theoretical_loss": 3.722244703524699, + "tokens_seen": 815142912 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003803109327983952, + "loss": 3.0824, + "theoretical_loss": 3.72221513966649, + "tokens_seen": 815208448 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038030090270812437, + "loss": 2.8156, + "theoretical_loss": 3.7221855788502864, + "tokens_seen": 815273984 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038029087261785355, + "loss": 3.0238, + "theoretical_loss": 3.7221560210755307, + "tokens_seen": 815339520 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003802808425275828, + "loss": 2.8424, + "theoretical_loss": 3.7221264663416656, + "tokens_seen": 815405056 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003802708124373119, + "loss": 3.0048, + "theoretical_loss": 3.7220969146481337, + "tokens_seen": 815470592 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038026078234704115, + "loss": 2.9598, + "theoretical_loss": 3.7220673659943784, + "tokens_seen": 815536128 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038025075225677033, + "loss": 2.9977, + "theoretical_loss": 3.7220378203798425, + "tokens_seen": 815601664 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003802407221664995, + "loss": 2.7807, + "theoretical_loss": 3.7220082778039694, + "tokens_seen": 815667200 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003802306920762287, + "loss": 2.9292, + "theoretical_loss": 3.7219787382662024, + "tokens_seen": 815732736 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003802206619859579, + "loss": 2.9789, + "theoretical_loss": 3.7219492017659848, + "tokens_seen": 815798272 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038021063189568705, + "loss": 2.8689, + "theoretical_loss": 3.7219196683027604, + "tokens_seen": 815863808 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1317099, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8458099365234375, + "objective/train/theoretical_loss": 3.721904902709847, + "objective/train/tokens_used": 836356576, + "theoretical_loss": 3.721904902709847, + "tokens_seen": 815896576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003802006018054163, + "loss": 3.0174, + "theoretical_loss": 3.7218901378759734, + "tokens_seen": 815929344 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003801905717151454, + "loss": 2.9823, + "theoretical_loss": 3.7218606104850673, + "tokens_seen": 815994880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038018054162487465, + "loss": 2.7996, + "theoretical_loss": 3.7218310861294865, + "tokens_seen": 816060416 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003801705115346038, + "loss": 2.9945, + "theoretical_loss": 3.7218015648086746, + "tokens_seen": 816125952 + }, + { + "epoch": 2.04, + "learning_rate": 0.000380160481444333, + "loss": 3.1156, + "theoretical_loss": 3.7217720465220765, + "tokens_seen": 816191488 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003801504513540622, + "loss": 3.0423, + "theoretical_loss": 3.721742531269137, + "tokens_seen": 816257024 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003801404212637914, + "loss": 2.9512, + "theoretical_loss": 3.7217130190493006, + "tokens_seen": 816322560 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038013039117352056, + "loss": 2.9472, + "theoretical_loss": 3.721683509862012, + "tokens_seen": 816388096 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038012036108324974, + "loss": 2.8541, + "theoretical_loss": 3.7216540037067163, + "tokens_seen": 816453632 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003801103309929789, + "loss": 3.0545, + "theoretical_loss": 3.721624500582858, + "tokens_seen": 816519168 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038010030090270816, + "loss": 2.9527, + "theoretical_loss": 3.721595000489884, + "tokens_seen": 816584704 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003800902708124373, + "loss": 2.9193, + "theoretical_loss": 3.721565503427238, + "tokens_seen": 816650240 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003800802407221665, + "loss": 3.0601, + "theoretical_loss": 3.7215360093943666, + "tokens_seen": 816715776 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003800702106318957, + "loss": 2.91, + "theoretical_loss": 3.721506518390715, + "tokens_seen": 816781312 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003800601805416249, + "loss": 3.0505, + "theoretical_loss": 3.721477030415729, + "tokens_seen": 816846848 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038005015045135406, + "loss": 2.9514, + "theoretical_loss": 3.7214475454688554, + "tokens_seen": 816912384 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038004012036108324, + "loss": 2.6996, + "theoretical_loss": 3.7214180635495397, + "tokens_seen": 816977920 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003800300902708124, + "loss": 3.0906, + "theoretical_loss": 3.7213885846572277, + "tokens_seen": 817043456 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038002006018054166, + "loss": 3.006, + "theoretical_loss": 3.7213591087913667, + "tokens_seen": 817108992 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003800100300902708, + "loss": 2.9778, + "theoretical_loss": 3.7213296359514034, + "tokens_seen": 817174528 + }, + { + "epoch": 2.04, + "learning_rate": 0.00038, + "loss": 2.8266, + "theoretical_loss": 3.721300166136784, + "tokens_seen": 817240064 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037998996990972915, + "loss": 2.7698, + "theoretical_loss": 3.721270699346956, + "tokens_seen": 817305600 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003799799398194584, + "loss": 2.9501, + "theoretical_loss": 3.7212412355813655, + "tokens_seen": 817371136 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037996990972918756, + "loss": 3.0842, + "theoretical_loss": 3.7212117748394604, + "tokens_seen": 817436672 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037995987963891675, + "loss": 2.9274, + "theoretical_loss": 3.7211823171206873, + "tokens_seen": 817502208 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1320055, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1113200187683105, + "objective/train/theoretical_loss": 3.7211675893948026, + "objective/train/tokens_used": 837994976, + "theoretical_loss": 3.7211675893948026, + "tokens_seen": 817534976 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003799498495486459, + "loss": 2.8487, + "theoretical_loss": 3.7211528624244945, + "tokens_seen": 817567744 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037993981945837516, + "loss": 2.8474, + "theoretical_loss": 3.7211234107503293, + "tokens_seen": 817633280 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003799297893681043, + "loss": 2.9404, + "theoretical_loss": 3.721093962097639, + "tokens_seen": 817698816 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003799197592778335, + "loss": 3.1018, + "theoretical_loss": 3.7210645164658724, + "tokens_seen": 817764352 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037990972918756265, + "loss": 3.0135, + "theoretical_loss": 3.721035073854477, + "tokens_seen": 817829888 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003798996990972919, + "loss": 2.828, + "theoretical_loss": 3.7210056342629008, + "tokens_seen": 817895424 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037988966900702107, + "loss": 2.994, + "theoretical_loss": 3.7209761976905926, + "tokens_seen": 817960960 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037987963891675025, + "loss": 2.9548, + "theoretical_loss": 3.7209467641370004, + "tokens_seen": 818026496 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003798696088264795, + "loss": 3.0814, + "theoretical_loss": 3.7209173336015735, + "tokens_seen": 818092032 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003798595787362086, + "loss": 2.8864, + "theoretical_loss": 3.72088790608376, + "tokens_seen": 818157568 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037984954864593785, + "loss": 3.0265, + "theoretical_loss": 3.720858481583009, + "tokens_seen": 818223104 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037983951855566703, + "loss": 3.011, + "theoretical_loss": 3.7208290600987697, + "tokens_seen": 818288640 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003798294884653962, + "loss": 2.7063, + "theoretical_loss": 3.7207996416304914, + "tokens_seen": 818354176 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003798194583751254, + "loss": 2.8289, + "theoretical_loss": 3.7207702261776228, + "tokens_seen": 818419712 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037980942828485457, + "loss": 2.9424, + "theoretical_loss": 3.7207408137396145, + "tokens_seen": 818485248 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037979939819458375, + "loss": 3.055, + "theoretical_loss": 3.7207114043159146, + "tokens_seen": 818550784 + }, + { + "epoch": 2.04, + "learning_rate": 0.000379789368104313, + "loss": 2.9873, + "theoretical_loss": 3.7206819979059746, + "tokens_seen": 818616320 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003797793380140421, + "loss": 2.9334, + "theoretical_loss": 3.720652594509243, + "tokens_seen": 818681856 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037976930792377135, + "loss": 2.8493, + "theoretical_loss": 3.7206231941251713, + "tokens_seen": 818747392 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037975927783350053, + "loss": 2.9511, + "theoretical_loss": 3.7205937967532083, + "tokens_seen": 818812928 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003797492477432297, + "loss": 3.0119, + "theoretical_loss": 3.7205644023928053, + "tokens_seen": 818878464 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003797392176529589, + "loss": 3.0179, + "theoretical_loss": 3.7205350110434123, + "tokens_seen": 818944000 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003797291875626881, + "loss": 2.9675, + "theoretical_loss": 3.72050562270448, + "tokens_seen": 819009536 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037971915747241725, + "loss": 2.8702, + "theoretical_loss": 3.7204762373754594, + "tokens_seen": 819075072 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003797091273821465, + "loss": 3.2149, + "theoretical_loss": 3.720446855055801, + "tokens_seen": 819140608 + }, + { + "debugging/Self-BLEU-5": 0.5589405917927349, + "debugging/distinct-1-grams": 0.7321674781129353, + "debugging/distinct-2-grams": 0.9263666029562126, + "debugging/entropy-1-grams": 5.9160653416865285, + "debugging/entropy-2-grams": 6.9658224093004755, + "debugging/length": 543.0666666666667, + "debugging/num_segments": 15, + "epoch": 2.04, + "objective/train/docs_used": 1323057, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5349152088165283, + "objective/train/theoretical_loss": 3.720432165024312, + "objective/train/tokens_used": 839633376, + "theoretical_loss": 3.720432165024312, + "tokens_seen": 819173376 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003796990972918756, + "loss": 2.9933, + "theoretical_loss": 3.7204174757449566, + "tokens_seen": 819206144 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037968906720160485, + "loss": 3.0615, + "theoretical_loss": 3.7203880994423777, + "tokens_seen": 819271680 + }, + { + "epoch": 2.04, + "learning_rate": 0.000379679037111334, + "loss": 2.9333, + "theoretical_loss": 3.7203587261475137, + "tokens_seen": 819337216 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003796690070210632, + "loss": 2.7834, + "theoretical_loss": 3.7203293558598185, + "tokens_seen": 819402752 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003796589769307924, + "loss": 3.138, + "theoretical_loss": 3.7202999885787422, + "tokens_seen": 819468288 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003796489468405216, + "loss": 2.8021, + "theoretical_loss": 3.720270624303737, + "tokens_seen": 819533824 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037963891675025076, + "loss": 3.0045, + "theoretical_loss": 3.7202412630342554, + "tokens_seen": 819599360 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037962888665997994, + "loss": 2.9935, + "theoretical_loss": 3.7202119047697484, + "tokens_seen": 819664896 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003796188565697091, + "loss": 3.008, + "theoretical_loss": 3.720182549509669, + "tokens_seen": 819730432 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037960882647943836, + "loss": 3.101, + "theoretical_loss": 3.7201531972534694, + "tokens_seen": 819795968 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003795987963891675, + "loss": 2.8437, + "theoretical_loss": 3.7201238480006027, + "tokens_seen": 819861504 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003795887662988967, + "loss": 3.1106, + "theoretical_loss": 3.7200945017505207, + "tokens_seen": 819927040 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003795787362086259, + "loss": 3.0296, + "theoretical_loss": 3.7200651585026763, + "tokens_seen": 819992576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003795687061183551, + "loss": 2.8639, + "theoretical_loss": 3.7200358182565227, + "tokens_seen": 820058112 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037955867602808426, + "loss": 2.9353, + "theoretical_loss": 3.720006481011513, + "tokens_seen": 820123648 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037954864593781344, + "loss": 2.9774, + "theoretical_loss": 3.7199771467671003, + "tokens_seen": 820189184 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003795386158475426, + "loss": 2.7799, + "theoretical_loss": 3.7199478155227386, + "tokens_seen": 820254720 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037952858575727186, + "loss": 2.9084, + "theoretical_loss": 3.71991848727788, + "tokens_seen": 820320256 + }, + { + "epoch": 2.04, + "learning_rate": 0.000379518555667001, + "loss": 3.0896, + "theoretical_loss": 3.7198891620319796, + "tokens_seen": 820385792 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003795085255767302, + "loss": 2.7571, + "theoretical_loss": 3.719859839784491, + "tokens_seen": 820451328 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037949849548645935, + "loss": 2.9186, + "theoretical_loss": 3.719830520534867, + "tokens_seen": 820516864 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003794884653961886, + "loss": 2.9168, + "theoretical_loss": 3.719801204282563, + "tokens_seen": 820582400 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037947843530591776, + "loss": 3.0843, + "theoretical_loss": 3.7197718910270328, + "tokens_seen": 820647936 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037946840521564695, + "loss": 2.9359, + "theoretical_loss": 3.719742580767731, + "tokens_seen": 820713472 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003794583751253761, + "loss": 2.854, + "theoretical_loss": 3.719713273504111, + "tokens_seen": 820779008 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1325039, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9358785152435303, + "objective/train/theoretical_loss": 3.719698620995512, + "objective/train/tokens_used": 841271776, + "theoretical_loss": 3.719698620995512, + "tokens_seen": 820811776 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037944834503510536, + "loss": 2.7585, + "theoretical_loss": 3.7196839692356285, + "tokens_seen": 820844544 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003794383149448345, + "loss": 2.8986, + "theoretical_loss": 3.7196546679617386, + "tokens_seen": 820910080 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003794282848545637, + "loss": 2.989, + "theoretical_loss": 3.7196253696818955, + "tokens_seen": 820975616 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037941825476429285, + "loss": 2.8311, + "theoretical_loss": 3.7195960743955547, + "tokens_seen": 821041152 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003794082246740221, + "loss": 2.7651, + "theoretical_loss": 3.7195667821021714, + "tokens_seen": 821106688 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037939819458375127, + "loss": 3.0372, + "theoretical_loss": 3.7195374928012006, + "tokens_seen": 821172224 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037938816449348045, + "loss": 2.9778, + "theoretical_loss": 3.7195082064920983, + "tokens_seen": 821237760 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037937813440320963, + "loss": 3.0169, + "theoretical_loss": 3.7194789231743197, + "tokens_seen": 821303296 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003793681043129388, + "loss": 2.9938, + "theoretical_loss": 3.719449642847321, + "tokens_seen": 821368832 + }, + { + "epoch": 2.04, + "learning_rate": 0.000379358074222668, + "loss": 2.877, + "theoretical_loss": 3.719420365510558, + "tokens_seen": 821434368 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037934804413239723, + "loss": 2.9284, + "theoretical_loss": 3.7193910911634878, + "tokens_seen": 821499904 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037933801404212635, + "loss": 3.0486, + "theoretical_loss": 3.719361819805565, + "tokens_seen": 821565440 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003793279839518556, + "loss": 3.024, + "theoretical_loss": 3.7193325514362465, + "tokens_seen": 821630976 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003793179538615847, + "loss": 2.8726, + "theoretical_loss": 3.7193032860549886, + "tokens_seen": 821696512 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037930792377131395, + "loss": 2.9502, + "theoretical_loss": 3.7192740236612485, + "tokens_seen": 821762048 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037929789368104313, + "loss": 3.051, + "theoretical_loss": 3.7192447642544835, + "tokens_seen": 821827584 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003792878635907723, + "loss": 3.0522, + "theoretical_loss": 3.719215507834149, + "tokens_seen": 821893120 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003792778335005015, + "loss": 3.0152, + "theoretical_loss": 3.7191862543997036, + "tokens_seen": 821958656 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037926780341023073, + "loss": 2.8427, + "theoretical_loss": 3.7191570039506034, + "tokens_seen": 822024192 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037925777331995986, + "loss": 2.8526, + "theoretical_loss": 3.719127756486307, + "tokens_seen": 822089728 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003792477432296891, + "loss": 2.9275, + "theoretical_loss": 3.7190985120062705, + "tokens_seen": 822155264 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003792377131394182, + "loss": 2.9555, + "theoretical_loss": 3.7190692705099524, + "tokens_seen": 822220800 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037922768304914746, + "loss": 2.9889, + "theoretical_loss": 3.71904003199681, + "tokens_seen": 822286336 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037921765295887664, + "loss": 2.9245, + "theoretical_loss": 3.719010796466302, + "tokens_seen": 822351872 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003792076228686058, + "loss": 2.9935, + "theoretical_loss": 3.718981563917886, + "tokens_seen": 822417408 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1327913, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1506879329681396, + "objective/train/theoretical_loss": 3.718966948761793, + "objective/train/tokens_used": 842910176, + "theoretical_loss": 3.718966948761793, + "tokens_seen": 822450176 + }, + { + "epoch": 2.04, + "learning_rate": 0.000379197592778335, + "loss": 3.141, + "theoretical_loss": 3.7189523343510205, + "tokens_seen": 822482944 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003791875626880642, + "loss": 2.9933, + "theoretical_loss": 3.718923107765163, + "tokens_seen": 822548480 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037917753259779336, + "loss": 3.0492, + "theoretical_loss": 3.7188938841597734, + "tokens_seen": 822614016 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003791675025075226, + "loss": 3.0634, + "theoretical_loss": 3.718864663534309, + "tokens_seen": 822679552 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003791574724172517, + "loss": 3.0632, + "theoretical_loss": 3.718835445888229, + "tokens_seen": 822745088 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037914744232698096, + "loss": 3.1283, + "theoretical_loss": 3.718806231220993, + "tokens_seen": 822810624 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003791374122367101, + "loss": 3.083, + "theoretical_loss": 3.7187770195320593, + "tokens_seen": 822876160 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003791273821464393, + "loss": 2.9528, + "theoretical_loss": 3.7187478108208873, + "tokens_seen": 822941696 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037911735205616856, + "loss": 2.9544, + "theoretical_loss": 3.718718605086937, + "tokens_seen": 823007232 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003791073219658977, + "loss": 3.1187, + "theoretical_loss": 3.718689402329667, + "tokens_seen": 823072768 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003790972918756269, + "loss": 2.9935, + "theoretical_loss": 3.718660202548537, + "tokens_seen": 823138304 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003790872617853561, + "loss": 2.9843, + "theoretical_loss": 3.718631005743007, + "tokens_seen": 823203840 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003790772316950853, + "loss": 2.8357, + "theoretical_loss": 3.7186018119125377, + "tokens_seen": 823269376 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037906720160481446, + "loss": 2.8568, + "theoretical_loss": 3.7185726210565875, + "tokens_seen": 823334912 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037905717151454364, + "loss": 2.9919, + "theoretical_loss": 3.7185434331746183, + "tokens_seen": 823400448 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003790471414242728, + "loss": 2.9219, + "theoretical_loss": 3.718514248266089, + "tokens_seen": 823465984 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037903711133400206, + "loss": 2.9073, + "theoretical_loss": 3.7184850663304614, + "tokens_seen": 823531520 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003790270812437312, + "loss": 2.9629, + "theoretical_loss": 3.718455887367195, + "tokens_seen": 823597056 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003790170511534604, + "loss": 3.1082, + "theoretical_loss": 3.718426711375751, + "tokens_seen": 823662592 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037900702106318955, + "loss": 2.9026, + "theoretical_loss": 3.7183975383555903, + "tokens_seen": 823728128 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003789969909729188, + "loss": 2.9886, + "theoretical_loss": 3.7183683683061743, + "tokens_seen": 823793664 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037898696088264796, + "loss": 3.028, + "theoretical_loss": 3.7183392012269634, + "tokens_seen": 823859200 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037897693079237715, + "loss": 3.0805, + "theoretical_loss": 3.71831003711742, + "tokens_seen": 823924736 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003789669007021063, + "loss": 2.9537, + "theoretical_loss": 3.7182808759770047, + "tokens_seen": 823990272 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037895687061183556, + "loss": 2.7521, + "theoretical_loss": 3.7182517178051793, + "tokens_seen": 824055808 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1329241, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.038194417953491, + "objective/train/theoretical_loss": 3.7182371398323193, + "objective/train/tokens_used": 844548576, + "theoretical_loss": 3.7182371398323193, + "tokens_seen": 824088576 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003789468405215647, + "loss": 3.0504, + "theoretical_loss": 3.7182225626014054, + "tokens_seen": 824121344 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003789368104312939, + "loss": 3.049, + "theoretical_loss": 3.7181934103651457, + "tokens_seen": 824186880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037892678034102305, + "loss": 3.1002, + "theoretical_loss": 3.7181642610958607, + "tokens_seen": 824252416 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003789167502507523, + "loss": 3.0392, + "theoretical_loss": 3.7181351147930144, + "tokens_seen": 824317952 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037890672016048147, + "loss": 2.8477, + "theoretical_loss": 3.718105971456068, + "tokens_seen": 824383488 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037889669007021065, + "loss": 2.9995, + "theoretical_loss": 3.7180768310844834, + "tokens_seen": 824449024 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037888665997993983, + "loss": 2.8928, + "theoretical_loss": 3.7180476936777245, + "tokens_seen": 824514560 + }, + { + "epoch": 2.04, + "learning_rate": 0.000378876629889669, + "loss": 2.9093, + "theoretical_loss": 3.7180185592352535, + "tokens_seen": 824580096 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003788665997993982, + "loss": 2.9566, + "theoretical_loss": 3.7179894277565335, + "tokens_seen": 824645632 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037885656970912743, + "loss": 2.9566, + "theoretical_loss": 3.717960299241027, + "tokens_seen": 824711168 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037884653961885655, + "loss": 3.0311, + "theoretical_loss": 3.7179311736881973, + "tokens_seen": 824776704 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003788365095285858, + "loss": 2.9251, + "theoretical_loss": 3.7179020510975076, + "tokens_seen": 824842240 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003788264794383149, + "loss": 3.0744, + "theoretical_loss": 3.717872931468422, + "tokens_seen": 824907776 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037881644934804415, + "loss": 2.8408, + "theoretical_loss": 3.717843814800403, + "tokens_seen": 824973312 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037880641925777333, + "loss": 3.0144, + "theoretical_loss": 3.717814701092915, + "tokens_seen": 825038848 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003787963891675025, + "loss": 2.8344, + "theoretical_loss": 3.7177855903454224, + "tokens_seen": 825104384 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003787863590772317, + "loss": 2.9674, + "theoretical_loss": 3.717756482557388, + "tokens_seen": 825169920 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037877632898696093, + "loss": 2.8686, + "theoretical_loss": 3.717727377728276, + "tokens_seen": 825235456 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037876629889669006, + "loss": 3.0514, + "theoretical_loss": 3.7176982758575523, + "tokens_seen": 825300992 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003787562688064193, + "loss": 3.1245, + "theoretical_loss": 3.7176691769446792, + "tokens_seen": 825366528 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003787462387161484, + "loss": 3.0125, + "theoretical_loss": 3.7176400809891224, + "tokens_seen": 825432064 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037873620862587766, + "loss": 2.8039, + "theoretical_loss": 3.717610987990346, + "tokens_seen": 825497600 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037872617853560684, + "loss": 2.9263, + "theoretical_loss": 3.7175818979478157, + "tokens_seen": 825563136 + }, + { + "epoch": 2.04, + "learning_rate": 0.000378716148445336, + "loss": 2.8789, + "theoretical_loss": 3.7175528108609956, + "tokens_seen": 825628672 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003787061183550652, + "loss": 2.8253, + "theoretical_loss": 3.7175237267293517, + "tokens_seen": 825694208 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1332234, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.269618272781372, + "objective/train/theoretical_loss": 3.7175091857715534, + "objective/train/tokens_used": 846186976, + "theoretical_loss": 3.7175091857715534, + "tokens_seen": 825726976 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003786960882647944, + "loss": 3.0534, + "theoretical_loss": 3.717494645552348, + "tokens_seen": 825759744 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037868605817452356, + "loss": 2.798, + "theoretical_loss": 3.717465567329451, + "tokens_seen": 825825280 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003786760280842528, + "loss": 3.0784, + "theoretical_loss": 3.7174364920601257, + "tokens_seen": 825890816 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003786659979939819, + "loss": 2.8531, + "theoretical_loss": 3.717407419743838, + "tokens_seen": 825956352 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037865596790371116, + "loss": 3.02, + "theoretical_loss": 3.717378350380053, + "tokens_seen": 826021888 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003786459378134403, + "loss": 2.8923, + "theoretical_loss": 3.717349283968238, + "tokens_seen": 826087424 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003786359077231695, + "loss": 3.0098, + "theoretical_loss": 3.7173202205078577, + "tokens_seen": 826152960 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003786258776328987, + "loss": 3.0759, + "theoretical_loss": 3.7172911599983793, + "tokens_seen": 826218496 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003786158475426279, + "loss": 2.9935, + "theoretical_loss": 3.7172621024392685, + "tokens_seen": 826284032 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037860581745235706, + "loss": 3.0093, + "theoretical_loss": 3.7172330478299918, + "tokens_seen": 826349568 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003785957873620863, + "loss": 2.8422, + "theoretical_loss": 3.7172039961700163, + "tokens_seen": 826415104 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003785857572718154, + "loss": 2.8071, + "theoretical_loss": 3.7171749474588083, + "tokens_seen": 826480640 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037857572718154466, + "loss": 2.9727, + "theoretical_loss": 3.7171459016958353, + "tokens_seen": 826546176 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003785656970912738, + "loss": 3.0419, + "theoretical_loss": 3.717116858880564, + "tokens_seen": 826611712 + }, + { + "epoch": 2.04, + "learning_rate": 0.000378555667001003, + "loss": 2.9058, + "theoretical_loss": 3.7170878190124617, + "tokens_seen": 826677248 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003785456369107322, + "loss": 2.9465, + "theoretical_loss": 3.7170587820909953, + "tokens_seen": 826742784 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003785356068204614, + "loss": 3.0014, + "theoretical_loss": 3.7170297481156322, + "tokens_seen": 826808320 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037852557673019057, + "loss": 3.0521, + "theoretical_loss": 3.717000717085841, + "tokens_seen": 826873856 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037851554663991975, + "loss": 2.9319, + "theoretical_loss": 3.716971689001088, + "tokens_seen": 826939392 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037850551654964893, + "loss": 2.8296, + "theoretical_loss": 3.716942663860843, + "tokens_seen": 827004928 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037849548645937816, + "loss": 2.9573, + "theoretical_loss": 3.716913641664572, + "tokens_seen": 827070464 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003784854563691073, + "loss": 2.9665, + "theoretical_loss": 3.7168846224117447, + "tokens_seen": 827136000 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003784754262788365, + "loss": 2.9519, + "theoretical_loss": 3.7168556061018285, + "tokens_seen": 827201536 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037846539618856565, + "loss": 2.8506, + "theoretical_loss": 3.716826592734292, + "tokens_seen": 827267072 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003784553660982949, + "loss": 2.9177, + "theoretical_loss": 3.7167975823086037, + "tokens_seen": 827332608 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1334883, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.138312578201294, + "objective/train/theoretical_loss": 3.7167830781987874, + "objective/train/tokens_used": 847825376, + "theoretical_loss": 3.7167830781987874, + "tokens_seen": 827365376 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037844533600802407, + "loss": 3.0579, + "theoretical_loss": 3.716768574824233, + "tokens_seen": 827398144 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037843530591775325, + "loss": 2.9091, + "theoretical_loss": 3.716739570280648, + "tokens_seen": 827463680 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037842527582748243, + "loss": 2.9263, + "theoretical_loss": 3.7167105686773176, + "tokens_seen": 827529216 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037841524573721167, + "loss": 2.8262, + "theoretical_loss": 3.716681570013712, + "tokens_seen": 827594752 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003784052156469408, + "loss": 2.7854, + "theoretical_loss": 3.716652574289299, + "tokens_seen": 827660288 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037839518555667003, + "loss": 3.155, + "theoretical_loss": 3.7166235815035487, + "tokens_seen": 827725824 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037838515546639916, + "loss": 3.0099, + "theoretical_loss": 3.716594591655931, + "tokens_seen": 827791360 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003783751253761284, + "loss": 3.079, + "theoretical_loss": 3.716565604745915, + "tokens_seen": 827856896 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037836509528585763, + "loss": 2.9829, + "theoretical_loss": 3.7165366207729704, + "tokens_seen": 827922432 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037835506519558675, + "loss": 2.9312, + "theoretical_loss": 3.716507639736568, + "tokens_seen": 827987968 + }, + { + "epoch": 2.04, + "learning_rate": 0.000378345035105316, + "loss": 2.8867, + "theoretical_loss": 3.7164786616361773, + "tokens_seen": 828053504 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003783350050150451, + "loss": 2.8576, + "theoretical_loss": 3.716449686471268, + "tokens_seen": 828119040 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037832497492477435, + "loss": 2.9318, + "theoretical_loss": 3.7164207142413117, + "tokens_seen": 828184576 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037831494483450353, + "loss": 3.0801, + "theoretical_loss": 3.7163917449457777, + "tokens_seen": 828250112 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003783049147442327, + "loss": 3.0279, + "theoretical_loss": 3.7163627785841373, + "tokens_seen": 828315648 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003782948846539619, + "loss": 3.0361, + "theoretical_loss": 3.716333815155861, + "tokens_seen": 828381184 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037828485456369113, + "loss": 3.0504, + "theoretical_loss": 3.7163048546604203, + "tokens_seen": 828446720 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037827482447342026, + "loss": 2.9399, + "theoretical_loss": 3.716275897097286, + "tokens_seen": 828512256 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003782647943831495, + "loss": 2.7848, + "theoretical_loss": 3.7162469424659283, + "tokens_seen": 828577792 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003782547642928786, + "loss": 2.878, + "theoretical_loss": 3.7162179907658195, + "tokens_seen": 828643328 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037824473420260786, + "loss": 2.8738, + "theoretical_loss": 3.716189041996431, + "tokens_seen": 828708864 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037823470411233704, + "loss": 2.9893, + "theoretical_loss": 3.7161600961572345, + "tokens_seen": 828774400 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003782246740220662, + "loss": 2.809, + "theoretical_loss": 3.716131153247701, + "tokens_seen": 828839936 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003782146439317954, + "loss": 2.907, + "theoretical_loss": 3.716102213267303, + "tokens_seen": 828905472 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003782046138415246, + "loss": 2.9389, + "theoretical_loss": 3.7160732762155124, + "tokens_seen": 828971008 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1337648, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.128053665161133, + "objective/train/theoretical_loss": 3.71605880878768, + "objective/train/tokens_used": 849463776, + "theoretical_loss": 3.71605880878768, + "tokens_seen": 829003776 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037819458375125376, + "loss": 2.9154, + "theoretical_loss": 3.7160443420918012, + "tokens_seen": 829036544 + }, + { + "epoch": 2.04, + "learning_rate": 0.000378184553660983, + "loss": 2.9547, + "theoretical_loss": 3.7160154108956425, + "tokens_seen": 829102080 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003781745235707121, + "loss": 2.8789, + "theoretical_loss": 3.715986482626507, + "tokens_seen": 829167616 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037816449348044136, + "loss": 2.8141, + "theoretical_loss": 3.7159575572838692, + "tokens_seen": 829233152 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003781544633901705, + "loss": 2.9558, + "theoretical_loss": 3.7159286348672005, + "tokens_seen": 829298688 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003781444332998997, + "loss": 2.8659, + "theoretical_loss": 3.715899715375974, + "tokens_seen": 829364224 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003781344032096289, + "loss": 3.0167, + "theoretical_loss": 3.7158707988096626, + "tokens_seen": 829429760 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003781243731193581, + "loss": 2.9541, + "theoretical_loss": 3.71584188516774, + "tokens_seen": 829495296 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037811434302908726, + "loss": 2.8813, + "theoretical_loss": 3.715812974449679, + "tokens_seen": 829560832 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003781043129388165, + "loss": 2.9204, + "theoretical_loss": 3.7157840666549524, + "tokens_seen": 829626368 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003780942828485456, + "loss": 2.8101, + "theoretical_loss": 3.7157551617830347, + "tokens_seen": 829691904 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037808425275827486, + "loss": 2.8767, + "theoretical_loss": 3.7157262598333993, + "tokens_seen": 829757440 + }, + { + "epoch": 2.04, + "learning_rate": 0.000378074222668004, + "loss": 3.0004, + "theoretical_loss": 3.7156973608055193, + "tokens_seen": 829822976 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003780641925777332, + "loss": 3.0523, + "theoretical_loss": 3.7156684646988696, + "tokens_seen": 829888512 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003780541624874624, + "loss": 2.9167, + "theoretical_loss": 3.7156395715129236, + "tokens_seen": 829954048 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003780441323971916, + "loss": 2.8245, + "theoretical_loss": 3.7156106812471563, + "tokens_seen": 830019584 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037803410230692077, + "loss": 2.8898, + "theoretical_loss": 3.7155817939010407, + "tokens_seen": 830085120 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037802407221664995, + "loss": 3.0068, + "theoretical_loss": 3.7155529094740523, + "tokens_seen": 830150656 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037801404212637913, + "loss": 2.9578, + "theoretical_loss": 3.7155240279656647, + "tokens_seen": 830216192 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037800401203610836, + "loss": 2.937, + "theoretical_loss": 3.715495149375354, + "tokens_seen": 830281728 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003779939819458375, + "loss": 2.9159, + "theoretical_loss": 3.7154662737025945, + "tokens_seen": 830347264 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037798395185556673, + "loss": 2.8549, + "theoretical_loss": 3.7154374009468607, + "tokens_seen": 830412800 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037797392176529585, + "loss": 2.9825, + "theoretical_loss": 3.7154085311076286, + "tokens_seen": 830478336 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003779638916750251, + "loss": 3.1019, + "theoretical_loss": 3.715379664184373, + "tokens_seen": 830543872 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037795386158475427, + "loss": 2.9248, + "theoretical_loss": 3.7153508001765685, + "tokens_seen": 830609408 + }, + { + "epoch": 2.04, + "objective/train/docs_used": 1340604, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.797072172164917, + "objective/train/theoretical_loss": 3.7153363692657972, + "objective/train/tokens_used": 851102176, + "theoretical_loss": 3.7153363692657972, + "tokens_seen": 830642176 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037794383149448345, + "loss": 2.8612, + "theoretical_loss": 3.7153219390836925, + "tokens_seen": 830674944 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037793380140421263, + "loss": 2.8398, + "theoretical_loss": 3.7152930809052194, + "tokens_seen": 830740480 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037792377131394187, + "loss": 3.0089, + "theoretical_loss": 3.7152642256406248, + "tokens_seen": 830806016 + }, + { + "epoch": 2.04, + "learning_rate": 0.000377913741223671, + "loss": 2.9851, + "theoretical_loss": 3.715235373289386, + "tokens_seen": 830871552 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037790371113340023, + "loss": 2.8786, + "theoretical_loss": 3.715206523850978, + "tokens_seen": 830937088 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037789368104312936, + "loss": 2.9365, + "theoretical_loss": 3.715177677324877, + "tokens_seen": 831002624 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003778836509528586, + "loss": 2.9432, + "theoretical_loss": 3.71514883371056, + "tokens_seen": 831068160 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003778736208625878, + "loss": 2.9567, + "theoretical_loss": 3.7151199930075025, + "tokens_seen": 831133696 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037786359077231695, + "loss": 2.7329, + "theoretical_loss": 3.7150911552151826, + "tokens_seen": 831199232 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037785356068204614, + "loss": 3.0187, + "theoretical_loss": 3.715062320333076, + "tokens_seen": 831264768 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003778435305917753, + "loss": 3.0527, + "theoretical_loss": 3.7150334883606604, + "tokens_seen": 831330304 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003778335005015045, + "loss": 2.9418, + "theoretical_loss": 3.7150046592974113, + "tokens_seen": 831395840 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037782347041123373, + "loss": 2.9784, + "theoretical_loss": 3.714975833142808, + "tokens_seen": 831461376 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037781344032096286, + "loss": 2.9652, + "theoretical_loss": 3.7149470098963255, + "tokens_seen": 831526912 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003778034102306921, + "loss": 2.9334, + "theoretical_loss": 3.7149181895574435, + "tokens_seen": 831592448 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003777933801404213, + "loss": 2.9152, + "theoretical_loss": 3.714889372125638, + "tokens_seen": 831657984 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037778335005015046, + "loss": 2.9472, + "theoretical_loss": 3.7148605576003875, + "tokens_seen": 831723520 + }, + { + "epoch": 2.04, + "learning_rate": 0.00037777331995987964, + "loss": 3.096, + "theoretical_loss": 3.7148317459811695, + "tokens_seen": 831789056 + }, + { + "epoch": 2.04, + "learning_rate": 0.0003777632898696088, + "loss": 2.9976, + "theoretical_loss": 3.7148029372674625, + "tokens_seen": 831854592 + }, + { + "epoch": 2.05, + "learning_rate": 0.000377753259779338, + "loss": 3.0863, + "theoretical_loss": 3.7147741314587437, + "tokens_seen": 831920128 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037774322968906724, + "loss": 2.8157, + "theoretical_loss": 3.7147453285544922, + "tokens_seen": 831985664 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037773319959879636, + "loss": 2.932, + "theoretical_loss": 3.714716528554186, + "tokens_seen": 832051200 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003777231695085256, + "loss": 2.9594, + "theoretical_loss": 3.7146877314573037, + "tokens_seen": 832116736 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003777131394182547, + "loss": 2.9396, + "theoretical_loss": 3.714658937263324, + "tokens_seen": 832182272 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037770310932798396, + "loss": 2.7627, + "theoretical_loss": 3.714630145971726, + "tokens_seen": 832247808 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1343375, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.059628963470459, + "objective/train/theoretical_loss": 3.714615751414157, + "objective/train/tokens_used": 852740576, + "theoretical_loss": 3.714615751414157, + "tokens_seen": 832280576 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037769307923771314, + "loss": 2.947, + "theoretical_loss": 3.7146013575819876, + "tokens_seen": 832313344 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003776830491474423, + "loss": 2.8411, + "theoretical_loss": 3.714572572093589, + "tokens_seen": 832378880 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003776730190571715, + "loss": 2.9614, + "theoretical_loss": 3.714543789506009, + "tokens_seen": 832444416 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003776629889669007, + "loss": 2.9356, + "theoretical_loss": 3.7145150098187267, + "tokens_seen": 832509952 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037765295887662987, + "loss": 2.9317, + "theoretical_loss": 3.714486233031222, + "tokens_seen": 832575488 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003776429287863591, + "loss": 2.8723, + "theoretical_loss": 3.7144574591429746, + "tokens_seen": 832641024 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037763289869608823, + "loss": 3.0384, + "theoretical_loss": 3.7144286881534634, + "tokens_seen": 832706560 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037762286860581746, + "loss": 2.915, + "theoretical_loss": 3.714399920062169, + "tokens_seen": 832772096 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003776128385155467, + "loss": 2.9533, + "theoretical_loss": 3.714371154868571, + "tokens_seen": 832837632 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003776028084252758, + "loss": 3.1375, + "theoretical_loss": 3.71434239257215, + "tokens_seen": 832903168 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037759277833500506, + "loss": 2.853, + "theoretical_loss": 3.714313633172386, + "tokens_seen": 832968704 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003775827482447342, + "loss": 2.9037, + "theoretical_loss": 3.714284876668759, + "tokens_seen": 833034240 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003775727181544634, + "loss": 2.9961, + "theoretical_loss": 3.7142561230607507, + "tokens_seen": 833099776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003775626880641926, + "loss": 3.005, + "theoretical_loss": 3.7142273723478403, + "tokens_seen": 833165312 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003775526579739218, + "loss": 2.974, + "theoretical_loss": 3.7141986245295096, + "tokens_seen": 833230848 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037754262788365097, + "loss": 2.987, + "theoretical_loss": 3.7141698796052394, + "tokens_seen": 833296384 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037753259779338015, + "loss": 2.9271, + "theoretical_loss": 3.7141411375745106, + "tokens_seen": 833361920 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037752256770310933, + "loss": 2.8975, + "theoretical_loss": 3.7141123984368045, + "tokens_seen": 833427456 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037751253761283857, + "loss": 3.0864, + "theoretical_loss": 3.7140836621916025, + "tokens_seen": 833492992 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003775025075225677, + "loss": 3.0509, + "theoretical_loss": 3.7140549288383866, + "tokens_seen": 833558528 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037749247743229693, + "loss": 2.8988, + "theoretical_loss": 3.714026198376637, + "tokens_seen": 833624064 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037748244734202605, + "loss": 2.9081, + "theoretical_loss": 3.7139974708058365, + "tokens_seen": 833689600 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003774724172517553, + "loss": 2.8653, + "theoretical_loss": 3.7139687461254667, + "tokens_seen": 833755136 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037746238716148447, + "loss": 3.0936, + "theoretical_loss": 3.71394002433501, + "tokens_seen": 833820672 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037745235707121365, + "loss": 3.0689, + "theoretical_loss": 3.713911305433948, + "tokens_seen": 833886208 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1346191, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8802638053894043, + "objective/train/theoretical_loss": 3.7138969470667784, + "objective/train/tokens_used": 854378976, + "theoretical_loss": 3.7138969470667784, + "tokens_seen": 833918976 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037744232698094283, + "loss": 2.8997, + "theoretical_loss": 3.7138825894217633, + "tokens_seen": 833951744 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037743229689067207, + "loss": 2.738, + "theoretical_loss": 3.713853876297938, + "tokens_seen": 834017280 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003774222668004012, + "loss": 2.983, + "theoretical_loss": 3.7138251660619552, + "tokens_seen": 834082816 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037741223671013043, + "loss": 2.9235, + "theoretical_loss": 3.713796458713297, + "tokens_seen": 834148352 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037740220661985956, + "loss": 2.8375, + "theoretical_loss": 3.713767754251447, + "tokens_seen": 834213888 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003773921765295888, + "loss": 2.9312, + "theoretical_loss": 3.713739052675887, + "tokens_seen": 834279424 + }, + { + "epoch": 2.05, + "learning_rate": 0.000377382146439318, + "loss": 2.8639, + "theoretical_loss": 3.7137103539861007, + "tokens_seen": 834344960 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037737211634904715, + "loss": 2.9775, + "theoretical_loss": 3.7136816581815717, + "tokens_seen": 834410496 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037736208625877634, + "loss": 2.9521, + "theoretical_loss": 3.713652965261783, + "tokens_seen": 834476032 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003773520561685055, + "loss": 3.0673, + "theoretical_loss": 3.713624275226218, + "tokens_seen": 834541568 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003773420260782347, + "loss": 2.9721, + "theoretical_loss": 3.71359558807436, + "tokens_seen": 834607104 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037733199598796393, + "loss": 2.858, + "theoretical_loss": 3.7135669038056935, + "tokens_seen": 834672640 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037732196589769306, + "loss": 2.8079, + "theoretical_loss": 3.713538222419702, + "tokens_seen": 834738176 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003773119358074223, + "loss": 2.8586, + "theoretical_loss": 3.7135095439158694, + "tokens_seen": 834803712 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003773019057171515, + "loss": 2.9862, + "theoretical_loss": 3.71348086829368, + "tokens_seen": 834869248 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037729187562688066, + "loss": 2.9618, + "theoretical_loss": 3.713452195552618, + "tokens_seen": 834934784 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037728184553660984, + "loss": 2.9548, + "theoretical_loss": 3.7134235256921677, + "tokens_seen": 835000320 + }, + { + "epoch": 2.05, + "learning_rate": 0.000377271815446339, + "loss": 2.8704, + "theoretical_loss": 3.7133948587118137, + "tokens_seen": 835065856 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003772617853560682, + "loss": 3.0234, + "theoretical_loss": 3.7133661946110412, + "tokens_seen": 835131392 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037725175526579744, + "loss": 2.9572, + "theoretical_loss": 3.7133375333893346, + "tokens_seen": 835196928 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037724172517552656, + "loss": 3.2162, + "theoretical_loss": 3.7133088750461782, + "tokens_seen": 835262464 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003772316950852558, + "loss": 3.0557, + "theoretical_loss": 3.7132802195810584, + "tokens_seen": 835328000 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003772216649949849, + "loss": 3.0599, + "theoretical_loss": 3.7132515669934594, + "tokens_seen": 835393536 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037721163490471416, + "loss": 2.9397, + "theoretical_loss": 3.7132229172828666, + "tokens_seen": 835459072 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037720160481444334, + "loss": 2.7298, + "theoretical_loss": 3.713194270448766, + "tokens_seen": 835524608 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1347568, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2410809993743896, + "objective/train/theoretical_loss": 3.7131799481102394, + "objective/train/tokens_used": 856017376, + "theoretical_loss": 3.7131799481102394, + "tokens_seen": 835557376 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003771915747241725, + "loss": 2.9836, + "theoretical_loss": 3.713165626490643, + "tokens_seen": 835590144 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003771815446339017, + "loss": 3.0793, + "theoretical_loss": 3.713136985407983, + "tokens_seen": 835655680 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003771715145436309, + "loss": 3.0448, + "theoretical_loss": 3.713108347200272, + "tokens_seen": 835721216 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037716148445336007, + "loss": 2.8756, + "theoretical_loss": 3.713079711866997, + "tokens_seen": 835786752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003771514543630893, + "loss": 3.0501, + "theoretical_loss": 3.7130510794076423, + "tokens_seen": 835852288 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037714142427281843, + "loss": 2.8659, + "theoretical_loss": 3.713022449821696, + "tokens_seen": 835917824 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037713139418254766, + "loss": 2.8564, + "theoretical_loss": 3.7129938231086426, + "tokens_seen": 835983360 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037712136409227685, + "loss": 2.9918, + "theoretical_loss": 3.7129651992679706, + "tokens_seen": 836048896 + }, + { + "epoch": 2.05, + "learning_rate": 0.000377111334002006, + "loss": 2.9655, + "theoretical_loss": 3.712936578299165, + "tokens_seen": 836114432 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003771013039117352, + "loss": 3.013, + "theoretical_loss": 3.7129079602017137, + "tokens_seen": 836179968 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003770912738214644, + "loss": 2.8627, + "theoretical_loss": 3.712879344975103, + "tokens_seen": 836245504 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037708124373119357, + "loss": 2.8739, + "theoretical_loss": 3.7128507326188203, + "tokens_seen": 836311040 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003770712136409228, + "loss": 2.9199, + "theoretical_loss": 3.712822123132353, + "tokens_seen": 836376576 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037706118355065193, + "loss": 2.8861, + "theoretical_loss": 3.7127935165151875, + "tokens_seen": 836442112 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037705115346038117, + "loss": 2.9777, + "theoretical_loss": 3.7127649127668123, + "tokens_seen": 836507648 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003770411233701103, + "loss": 2.9916, + "theoretical_loss": 3.7127363118867147, + "tokens_seen": 836573184 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037703109327983953, + "loss": 2.8854, + "theoretical_loss": 3.712707713874382, + "tokens_seen": 836638720 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003770210631895687, + "loss": 2.9999, + "theoretical_loss": 3.7126791187293025, + "tokens_seen": 836704256 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003770110330992979, + "loss": 2.8769, + "theoretical_loss": 3.7126505264509637, + "tokens_seen": 836769792 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003770010030090271, + "loss": 3.0711, + "theoretical_loss": 3.712621937038854, + "tokens_seen": 836835328 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037699097291875625, + "loss": 2.9074, + "theoretical_loss": 3.7125933504924618, + "tokens_seen": 836900864 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037698094282848544, + "loss": 3.039, + "theoretical_loss": 3.7125647668112753, + "tokens_seen": 836966400 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037697091273821467, + "loss": 2.8437, + "theoretical_loss": 3.712536185994783, + "tokens_seen": 837031936 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003769608826479438, + "loss": 3.006, + "theoretical_loss": 3.7125076080424737, + "tokens_seen": 837097472 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037695085255767303, + "loss": 3.0813, + "theoretical_loss": 3.712479032953836, + "tokens_seen": 837163008 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1350397, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5820860862731934, + "objective/train/theoretical_loss": 3.712464746483234, + "objective/train/tokens_used": 857655776, + "theoretical_loss": 3.712464746483234, + "tokens_seen": 837195776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003769408224674022, + "loss": 2.8919, + "theoretical_loss": 3.712450460728359, + "tokens_seen": 837228544 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003769307923771314, + "loss": 2.8846, + "theoretical_loss": 3.7124218913655316, + "tokens_seen": 837294080 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003769207622868606, + "loss": 3.003, + "theoretical_loss": 3.712393324864842, + "tokens_seen": 837359616 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037691073219658976, + "loss": 2.9868, + "theoretical_loss": 3.712364761225781, + "tokens_seen": 837425152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037690070210631894, + "loss": 2.9539, + "theoretical_loss": 3.7123362004478375, + "tokens_seen": 837490688 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003768906720160482, + "loss": 2.8648, + "theoretical_loss": 3.7123076425305013, + "tokens_seen": 837556224 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003768806419257773, + "loss": 2.9829, + "theoretical_loss": 3.7122790874732616, + "tokens_seen": 837621760 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037687061183550654, + "loss": 2.8668, + "theoretical_loss": 3.7122505352756083, + "tokens_seen": 837687296 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003768605817452357, + "loss": 2.8686, + "theoretical_loss": 3.712221985937031, + "tokens_seen": 837752832 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003768505516549649, + "loss": 2.9973, + "theoretical_loss": 3.712193439457021, + "tokens_seen": 837818368 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037684052156469413, + "loss": 3.0119, + "theoretical_loss": 3.712164895835068, + "tokens_seen": 837883904 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037683049147442326, + "loss": 2.8861, + "theoretical_loss": 3.712136355070661, + "tokens_seen": 837949440 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003768204613841525, + "loss": 2.7144, + "theoretical_loss": 3.712107817163292, + "tokens_seen": 838014976 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003768104312938817, + "loss": 2.9095, + "theoretical_loss": 3.7120792821124513, + "tokens_seen": 838080512 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037680040120361086, + "loss": 2.9749, + "theoretical_loss": 3.71205074991763, + "tokens_seen": 838146048 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037679037111334004, + "loss": 2.984, + "theoretical_loss": 3.712022220578317, + "tokens_seen": 838211584 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003767803410230692, + "loss": 2.7701, + "theoretical_loss": 3.711993694094006, + "tokens_seen": 838277120 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003767703109327984, + "loss": 3.0536, + "theoretical_loss": 3.711965170464186, + "tokens_seen": 838342656 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037676028084252764, + "loss": 3.0025, + "theoretical_loss": 3.71193664968835, + "tokens_seen": 838408192 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037675025075225676, + "loss": 3.0306, + "theoretical_loss": 3.7119081317659877, + "tokens_seen": 838473728 + }, + { + "epoch": 2.05, + "learning_rate": 0.000376740220661986, + "loss": 2.8948, + "theoretical_loss": 3.711879616696592, + "tokens_seen": 838539264 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003767301905717151, + "loss": 2.9668, + "theoretical_loss": 3.7118511044796536, + "tokens_seen": 838604800 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037672016048144436, + "loss": 3.0166, + "theoretical_loss": 3.711822595114665, + "tokens_seen": 838670336 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037671013039117354, + "loss": 2.9041, + "theoretical_loss": 3.711794088601118, + "tokens_seen": 838735872 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003767001003009027, + "loss": 2.9524, + "theoretical_loss": 3.7117655849385036, + "tokens_seen": 838801408 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1352942, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0593631267547607, + "objective/train/theoretical_loss": 3.711751334176138, + "objective/train/tokens_used": 859294176, + "theoretical_loss": 3.711751334176138, + "tokens_seen": 838834176 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003766900702106319, + "loss": 2.893, + "theoretical_loss": 3.711737084126315, + "tokens_seen": 838866944 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003766800401203611, + "loss": 3.0134, + "theoretical_loss": 3.711708586164044, + "tokens_seen": 838932480 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037667001003009027, + "loss": 3.0029, + "theoretical_loss": 3.711680091051184, + "tokens_seen": 838998016 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003766599799398195, + "loss": 2.9643, + "theoretical_loss": 3.7116515987872267, + "tokens_seen": 839063552 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037664994984954863, + "loss": 3.0039, + "theoretical_loss": 3.711623109371665, + "tokens_seen": 839129088 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037663991975927786, + "loss": 3.0266, + "theoretical_loss": 3.711594622803991, + "tokens_seen": 839194624 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037662988966900705, + "loss": 2.9811, + "theoretical_loss": 3.7115661390836987, + "tokens_seen": 839260160 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003766198595787362, + "loss": 2.9767, + "theoretical_loss": 3.7115376582102804, + "tokens_seen": 839325696 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003766098294884654, + "loss": 3.0336, + "theoretical_loss": 3.71150918018323, + "tokens_seen": 839391232 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003765997993981946, + "loss": 2.8144, + "theoretical_loss": 3.7114807050020406, + "tokens_seen": 839456768 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037658976930792377, + "loss": 3.0877, + "theoretical_loss": 3.7114522326662054, + "tokens_seen": 839522304 + }, + { + "epoch": 2.05, + "learning_rate": 0.000376579739217653, + "loss": 3.0548, + "theoretical_loss": 3.711423763175218, + "tokens_seen": 839587840 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037656970912738213, + "loss": 3.0091, + "theoretical_loss": 3.711395296528573, + "tokens_seen": 839653376 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037655967903711137, + "loss": 2.9389, + "theoretical_loss": 3.711366832725763, + "tokens_seen": 839718912 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003765496489468405, + "loss": 2.9545, + "theoretical_loss": 3.7113383717662822, + "tokens_seen": 839784448 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037653961885656973, + "loss": 2.9257, + "theoretical_loss": 3.7113099136496257, + "tokens_seen": 839849984 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003765295887662989, + "loss": 2.9514, + "theoretical_loss": 3.7112814583752867, + "tokens_seen": 839915520 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003765195586760281, + "loss": 3.0864, + "theoretical_loss": 3.71125300594276, + "tokens_seen": 839981056 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003765095285857573, + "loss": 2.9783, + "theoretical_loss": 3.7112245563515405, + "tokens_seen": 840046592 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037649949849548645, + "loss": 3.0816, + "theoretical_loss": 3.711196109601122, + "tokens_seen": 840112128 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037648946840521564, + "loss": 2.928, + "theoretical_loss": 3.7111676656909998, + "tokens_seen": 840177664 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037647943831494487, + "loss": 2.9278, + "theoretical_loss": 3.7111392246206685, + "tokens_seen": 840243200 + }, + { + "epoch": 2.05, + "learning_rate": 0.000376469408224674, + "loss": 2.853, + "theoretical_loss": 3.7111107863896233, + "tokens_seen": 840308736 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037645937813440323, + "loss": 2.8958, + "theoretical_loss": 3.7110823509973594, + "tokens_seen": 840374272 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003764493480441324, + "loss": 3.0872, + "theoretical_loss": 3.7110539184433717, + "tokens_seen": 840439808 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1355828, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1608541011810303, + "objective/train/theoretical_loss": 3.711039703230574, + "objective/train/tokens_used": 860932576, + "theoretical_loss": 3.711039703230574, + "tokens_seen": 840472576 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003764393179538616, + "loss": 3.0709, + "theoretical_loss": 3.7110254887271563, + "tokens_seen": 840505344 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003764292878635908, + "loss": 3.0718, + "theoretical_loss": 3.710997061848208, + "tokens_seen": 840570880 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037641925777331996, + "loss": 3.037, + "theoretical_loss": 3.7109686378060234, + "tokens_seen": 840636416 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037640922768304914, + "loss": 3.0109, + "theoretical_loss": 3.7109402166000973, + "tokens_seen": 840701952 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003763991975927784, + "loss": 2.9849, + "theoretical_loss": 3.710911798229926, + "tokens_seen": 840767488 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003763891675025075, + "loss": 3.0431, + "theoretical_loss": 3.7108833826950054, + "tokens_seen": 840833024 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037637913741223674, + "loss": 2.9579, + "theoretical_loss": 3.7108549699948323, + "tokens_seen": 840898560 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037636910732196586, + "loss": 2.8275, + "theoretical_loss": 3.7108265601289023, + "tokens_seen": 840964096 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003763590772316951, + "loss": 2.7914, + "theoretical_loss": 3.7107981530967122, + "tokens_seen": 841029632 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003763490471414243, + "loss": 3.0562, + "theoretical_loss": 3.7107697488977585, + "tokens_seen": 841095168 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037633901705115346, + "loss": 2.9196, + "theoretical_loss": 3.710741347531538, + "tokens_seen": 841160704 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037632898696088264, + "loss": 2.9299, + "theoretical_loss": 3.710712948997547, + "tokens_seen": 841226240 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003763189568706119, + "loss": 2.9583, + "theoretical_loss": 3.710684553295283, + "tokens_seen": 841291776 + }, + { + "epoch": 2.05, + "learning_rate": 0.000376308926780341, + "loss": 2.8798, + "theoretical_loss": 3.710656160424243, + "tokens_seen": 841357312 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037629889669007024, + "loss": 3.0444, + "theoretical_loss": 3.7106277703839243, + "tokens_seen": 841422848 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037628886659979937, + "loss": 2.9979, + "theoretical_loss": 3.7105993831738235, + "tokens_seen": 841488384 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003762788365095286, + "loss": 2.9096, + "theoretical_loss": 3.7105709987934388, + "tokens_seen": 841553920 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003762688064192578, + "loss": 2.9048, + "theoretical_loss": 3.710542617242268, + "tokens_seen": 841619456 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037625877632898696, + "loss": 2.9094, + "theoretical_loss": 3.710514238519808, + "tokens_seen": 841684992 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037624874623871615, + "loss": 2.9662, + "theoretical_loss": 3.710485862625558, + "tokens_seen": 841750528 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003762387161484453, + "loss": 2.925, + "theoretical_loss": 3.7104574895590146, + "tokens_seen": 841816064 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003762286860581745, + "loss": 3.0019, + "theoretical_loss": 3.710429119319676, + "tokens_seen": 841881600 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037621865596790374, + "loss": 3.0511, + "theoretical_loss": 3.7104007519070414, + "tokens_seen": 841947136 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037620862587763287, + "loss": 3.0585, + "theoretical_loss": 3.710372387320608, + "tokens_seen": 842012672 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003761985957873621, + "loss": 3.004, + "theoretical_loss": 3.710344025559875, + "tokens_seen": 842078208 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1358700, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9046332836151123, + "objective/train/theoretical_loss": 3.7103298457389897, + "objective/train/tokens_used": 862570976, + "theoretical_loss": 3.7103298457389897, + "tokens_seen": 842110976 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037618856569709123, + "loss": 2.9399, + "theoretical_loss": 3.710315666624341, + "tokens_seen": 842143744 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037617853560682047, + "loss": 2.9568, + "theoretical_loss": 3.710287310513505, + "tokens_seen": 842209280 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037616850551654965, + "loss": 2.9322, + "theoretical_loss": 3.7102589572268654, + "tokens_seen": 842274816 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037615847542627883, + "loss": 2.8909, + "theoretical_loss": 3.710230606763921, + "tokens_seen": 842340352 + }, + { + "epoch": 2.05, + "learning_rate": 0.000376148445336008, + "loss": 2.9715, + "theoretical_loss": 3.7102022591241717, + "tokens_seen": 842405888 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037613841524573725, + "loss": 2.9454, + "theoretical_loss": 3.7101739143071164, + "tokens_seen": 842471424 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037612838515546637, + "loss": 3.033, + "theoretical_loss": 3.710145572312254, + "tokens_seen": 842536960 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003761183550651956, + "loss": 2.8721, + "theoretical_loss": 3.7101172331390844, + "tokens_seen": 842602496 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003761083249749248, + "loss": 2.8908, + "theoretical_loss": 3.7100888967871075, + "tokens_seen": 842668032 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037609829488465397, + "loss": 2.8408, + "theoretical_loss": 3.710060563255823, + "tokens_seen": 842733568 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003760882647943832, + "loss": 2.8203, + "theoretical_loss": 3.71003223254473, + "tokens_seen": 842799104 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037607823470411233, + "loss": 2.9337, + "theoretical_loss": 3.71000390465333, + "tokens_seen": 842864640 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037606820461384157, + "loss": 2.8544, + "theoretical_loss": 3.709975579581122, + "tokens_seen": 842930176 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003760581745235707, + "loss": 2.8973, + "theoretical_loss": 3.7099472573276064, + "tokens_seen": 842995712 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037604814443329993, + "loss": 2.9315, + "theoretical_loss": 3.709918937892284, + "tokens_seen": 843061248 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003760381143430291, + "loss": 2.8612, + "theoretical_loss": 3.709890621274655, + "tokens_seen": 843126784 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003760280842527583, + "loss": 3.0797, + "theoretical_loss": 3.7098623074742205, + "tokens_seen": 843192320 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003760180541624875, + "loss": 2.7194, + "theoretical_loss": 3.7098339964904805, + "tokens_seen": 843257856 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037600802407221665, + "loss": 2.8613, + "theoretical_loss": 3.7098056883229367, + "tokens_seen": 843323392 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037599799398194584, + "loss": 3.0671, + "theoretical_loss": 3.70977738297109, + "tokens_seen": 843388928 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037598796389167507, + "loss": 2.8745, + "theoretical_loss": 3.709749080434441, + "tokens_seen": 843454464 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003759779338014042, + "loss": 3.1024, + "theoretical_loss": 3.709720780712492, + "tokens_seen": 843520000 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037596790371113343, + "loss": 3.0113, + "theoretical_loss": 3.7096924838047434, + "tokens_seen": 843585536 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003759578736208626, + "loss": 3.0208, + "theoretical_loss": 3.7096641897106974, + "tokens_seen": 843651072 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003759478435305918, + "loss": 3.0783, + "theoretical_loss": 3.7096358984298554, + "tokens_seen": 843716608 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1361841, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.935087203979492, + "objective/train/theoretical_loss": 3.7096217538442304, + "objective/train/tokens_used": 864209376, + "theoretical_loss": 3.7096217538442304, + "tokens_seen": 843749376 + }, + { + "epoch": 2.05, + "learning_rate": 0.000375937813440321, + "loss": 2.9176, + "theoretical_loss": 3.7096076099617195, + "tokens_seen": 843782144 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037592778335005016, + "loss": 2.8424, + "theoretical_loss": 3.7095793243057913, + "tokens_seen": 843847680 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037591775325977934, + "loss": 2.9209, + "theoretical_loss": 3.709551041461573, + "tokens_seen": 843913216 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003759077231695086, + "loss": 2.9152, + "theoretical_loss": 3.7095227614285666, + "tokens_seen": 843978752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003758976930792377, + "loss": 3.0601, + "theoretical_loss": 3.709494484206275, + "tokens_seen": 844044288 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037588766298896694, + "loss": 3.0062, + "theoretical_loss": 3.7094662097941997, + "tokens_seen": 844109824 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037587763289869606, + "loss": 2.9372, + "theoretical_loss": 3.7094379381918445, + "tokens_seen": 844175360 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003758676028084253, + "loss": 3.0637, + "theoretical_loss": 3.7094096693987106, + "tokens_seen": 844240896 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003758575727181545, + "loss": 2.9076, + "theoretical_loss": 3.7093814034143024, + "tokens_seen": 844306432 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037584754262788366, + "loss": 3.0587, + "theoretical_loss": 3.7093531402381217, + "tokens_seen": 844371968 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037583751253761284, + "loss": 3.0666, + "theoretical_loss": 3.7093248798696714, + "tokens_seen": 844437504 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003758274824473421, + "loss": 3.119, + "theoretical_loss": 3.709296622308456, + "tokens_seen": 844503040 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003758174523570712, + "loss": 2.8861, + "theoretical_loss": 3.7092683675539777, + "tokens_seen": 844568576 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037580742226680044, + "loss": 2.9826, + "theoretical_loss": 3.7092401156057404, + "tokens_seen": 844634112 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037579739217652957, + "loss": 2.9934, + "theoretical_loss": 3.7092118664632476, + "tokens_seen": 844699648 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003757873620862588, + "loss": 2.8892, + "theoretical_loss": 3.709183620126003, + "tokens_seen": 844765184 + }, + { + "epoch": 2.05, + "learning_rate": 0.000375777331995988, + "loss": 2.9452, + "theoretical_loss": 3.70915537659351, + "tokens_seen": 844830720 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037576730190571716, + "loss": 2.926, + "theoretical_loss": 3.7091271358652738, + "tokens_seen": 844896256 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037575727181544635, + "loss": 3.006, + "theoretical_loss": 3.7090988979407973, + "tokens_seen": 844961792 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003757472417251755, + "loss": 2.8829, + "theoretical_loss": 3.7090706628195846, + "tokens_seen": 845027328 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003757372116349047, + "loss": 2.9381, + "theoretical_loss": 3.7090424305011416, + "tokens_seen": 845092864 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037572718154463394, + "loss": 2.8756, + "theoretical_loss": 3.709014200984971, + "tokens_seen": 845158400 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037571715145436307, + "loss": 3.0301, + "theoretical_loss": 3.708985974270578, + "tokens_seen": 845223936 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003757071213640923, + "loss": 2.8209, + "theoretical_loss": 3.7089577503574676, + "tokens_seen": 845289472 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037569709127382143, + "loss": 2.9455, + "theoretical_loss": 3.7089295292451445, + "tokens_seen": 845355008 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1364607, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1927335262298584, + "objective/train/theoretical_loss": 3.7089154197391236, + "objective/train/tokens_used": 865847776, + "theoretical_loss": 3.7089154197391236, + "tokens_seen": 845387776 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037568706118355067, + "loss": 3.0645, + "theoretical_loss": 3.7089013109331135, + "tokens_seen": 845420544 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037567703109327985, + "loss": 2.8484, + "theoretical_loss": 3.7088730954208797, + "tokens_seen": 845486080 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037566700100300903, + "loss": 2.9066, + "theoretical_loss": 3.7088448827079485, + "tokens_seen": 845551616 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003756569709127382, + "loss": 2.9047, + "theoretical_loss": 3.708816672793825, + "tokens_seen": 845617152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037564694082246745, + "loss": 2.9404, + "theoretical_loss": 3.7087884656780155, + "tokens_seen": 845682688 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003756369107321966, + "loss": 3.0628, + "theoretical_loss": 3.708760261360024, + "tokens_seen": 845748224 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003756268806419258, + "loss": 2.9892, + "theoretical_loss": 3.708732059839358, + "tokens_seen": 845813760 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037561685055165494, + "loss": 2.8019, + "theoretical_loss": 3.708703861115522, + "tokens_seen": 845879296 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037560682046138417, + "loss": 3.0048, + "theoretical_loss": 3.7086756651880224, + "tokens_seen": 845944832 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037559679037111335, + "loss": 3.1925, + "theoretical_loss": 3.7086474720563656, + "tokens_seen": 846010368 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037558676028084253, + "loss": 3.0037, + "theoretical_loss": 3.708619281720057, + "tokens_seen": 846075904 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003755767301905717, + "loss": 2.8817, + "theoretical_loss": 3.708591094178604, + "tokens_seen": 846141440 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003755667001003009, + "loss": 2.876, + "theoretical_loss": 3.708562909431513, + "tokens_seen": 846206976 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003755566700100301, + "loss": 3.035, + "theoretical_loss": 3.7085347274782894, + "tokens_seen": 846272512 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003755466399197593, + "loss": 3.0249, + "theoretical_loss": 3.708506548318441, + "tokens_seen": 846338048 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037553660982948844, + "loss": 2.8142, + "theoretical_loss": 3.708478371951475, + "tokens_seen": 846403584 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003755265797392177, + "loss": 2.9504, + "theoretical_loss": 3.708450198376897, + "tokens_seen": 846469120 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003755165496489468, + "loss": 2.9726, + "theoretical_loss": 3.708422027594215, + "tokens_seen": 846534656 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037550651955867604, + "loss": 3.032, + "theoretical_loss": 3.7083938596029364, + "tokens_seen": 846600192 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003754964894684052, + "loss": 2.8593, + "theoretical_loss": 3.708365694402568, + "tokens_seen": 846665728 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003754864593781344, + "loss": 2.9764, + "theoretical_loss": 3.7083375319926177, + "tokens_seen": 846731264 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003754764292878636, + "loss": 2.8029, + "theoretical_loss": 3.708309372372593, + "tokens_seen": 846796800 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003754663991975928, + "loss": 2.9924, + "theoretical_loss": 3.708281215542001, + "tokens_seen": 846862336 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037545636910732194, + "loss": 3.0905, + "theoretical_loss": 3.7082530615003506, + "tokens_seen": 846927872 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003754463390170512, + "loss": 2.9296, + "theoretical_loss": 3.708224910247149, + "tokens_seen": 846993408 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1366742, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9962239265441895, + "objective/train/theoretical_loss": 3.708210835666063, + "objective/train/tokens_used": 867486176, + "theoretical_loss": 3.708210835666063, + "tokens_seen": 847026176 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003754363089267803, + "loss": 2.8932, + "theoretical_loss": 3.7081967617819047, + "tokens_seen": 847058944 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037542627883650954, + "loss": 2.9914, + "theoretical_loss": 3.7081686161041256, + "tokens_seen": 847124480 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003754162487462387, + "loss": 3.0546, + "theoretical_loss": 3.7081404732133203, + "tokens_seen": 847190016 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003754062186559679, + "loss": 2.9677, + "theoretical_loss": 3.7081123331089976, + "tokens_seen": 847255552 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003753961885656971, + "loss": 3.0796, + "theoretical_loss": 3.7080841957906654, + "tokens_seen": 847321088 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037538615847542626, + "loss": 2.7315, + "theoretical_loss": 3.708056061257833, + "tokens_seen": 847386624 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037537612838515544, + "loss": 2.9727, + "theoretical_loss": 3.7080279295100085, + "tokens_seen": 847452160 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003753660982948847, + "loss": 2.914, + "theoretical_loss": 3.707999800546702, + "tokens_seen": 847517696 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037535606820461386, + "loss": 2.905, + "theoretical_loss": 3.7079716743674216, + "tokens_seen": 847583232 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037534603811434304, + "loss": 2.8099, + "theoretical_loss": 3.707943550971677, + "tokens_seen": 847648768 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003753360080240723, + "loss": 3.0331, + "theoretical_loss": 3.707915430358977, + "tokens_seen": 847714304 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003753259779338014, + "loss": 2.8778, + "theoretical_loss": 3.707887312528832, + "tokens_seen": 847779840 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037531594784353064, + "loss": 3.0039, + "theoretical_loss": 3.707859197480751, + "tokens_seen": 847845376 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037530591775325977, + "loss": 3.0571, + "theoretical_loss": 3.7078310852142438, + "tokens_seen": 847910912 + }, + { + "epoch": 2.05, + "learning_rate": 0.000375295887662989, + "loss": 2.9749, + "theoretical_loss": 3.7078029757288204, + "tokens_seen": 847976448 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003752858575727182, + "loss": 3.0606, + "theoretical_loss": 3.7077748690239902, + "tokens_seen": 848041984 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037527582748244736, + "loss": 3.0119, + "theoretical_loss": 3.707746765099264, + "tokens_seen": 848107520 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037526579739217655, + "loss": 2.8862, + "theoretical_loss": 3.707718663954152, + "tokens_seen": 848173056 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003752557673019057, + "loss": 2.9118, + "theoretical_loss": 3.7076905655881642, + "tokens_seen": 848238592 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003752457372116349, + "loss": 2.8543, + "theoretical_loss": 3.7076624700008107, + "tokens_seen": 848304128 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037523570712136414, + "loss": 2.9509, + "theoretical_loss": 3.707634377191603, + "tokens_seen": 848369664 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037522567703109327, + "loss": 3.0627, + "theoretical_loss": 3.7076062871600515, + "tokens_seen": 848435200 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003752156469408225, + "loss": 3.0091, + "theoretical_loss": 3.7075781999056665, + "tokens_seen": 848500736 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037520561685055163, + "loss": 2.9548, + "theoretical_loss": 3.70755011542796, + "tokens_seen": 848566272 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037519558676028087, + "loss": 2.9462, + "theoretical_loss": 3.707522033726441, + "tokens_seen": 848631808 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1369647, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.730207920074463, + "objective/train/theoretical_loss": 3.7075079939166002, + "objective/train/tokens_used": 869124576, + "theoretical_loss": 3.7075079939166002, + "tokens_seen": 848664576 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037518555667001005, + "loss": 2.9269, + "theoretical_loss": 3.707493954800624, + "tokens_seen": 848697344 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037517552657973923, + "loss": 2.8813, + "theoretical_loss": 3.707465878650017, + "tokens_seen": 848762880 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003751654964894684, + "loss": 3.1588, + "theoretical_loss": 3.707437805274134, + "tokens_seen": 848828416 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037515546639919765, + "loss": 2.9714, + "theoretical_loss": 3.707409734672485, + "tokens_seen": 848893952 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003751454363089268, + "loss": 2.87, + "theoretical_loss": 3.707381666844582, + "tokens_seen": 848959488 + }, + { + "epoch": 2.05, + "learning_rate": 0.000375135406218656, + "loss": 2.9683, + "theoretical_loss": 3.7073536017899373, + "tokens_seen": 849025024 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037512537612838514, + "loss": 3.0506, + "theoretical_loss": 3.707325539508063, + "tokens_seen": 849090560 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037511534603811437, + "loss": 2.8479, + "theoretical_loss": 3.7072974799984704, + "tokens_seen": 849156096 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037510531594784355, + "loss": 2.9103, + "theoretical_loss": 3.707269423260672, + "tokens_seen": 849221632 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037509528585757273, + "loss": 2.9999, + "theoretical_loss": 3.7072413692941804, + "tokens_seen": 849287168 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003750852557673019, + "loss": 3.0242, + "theoretical_loss": 3.707213318098508, + "tokens_seen": 849352704 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003750752256770311, + "loss": 3.0458, + "theoretical_loss": 3.707185269673167, + "tokens_seen": 849418240 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003750651955867603, + "loss": 2.8534, + "theoretical_loss": 3.7071572240176702, + "tokens_seen": 849483776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003750551654964895, + "loss": 2.9704, + "theoretical_loss": 3.7071291811315303, + "tokens_seen": 849549312 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037504513540621864, + "loss": 3.1412, + "theoretical_loss": 3.7071011410142614, + "tokens_seen": 849614848 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003750351053159479, + "loss": 2.9395, + "theoretical_loss": 3.7070731036653743, + "tokens_seen": 849680384 + }, + { + "epoch": 2.05, + "learning_rate": 0.000375025075225677, + "loss": 2.9936, + "theoretical_loss": 3.7070450690843844, + "tokens_seen": 849745920 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037501504513540624, + "loss": 3.0052, + "theoretical_loss": 3.7070170372708042, + "tokens_seen": 849811456 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003750050150451354, + "loss": 2.88, + "theoretical_loss": 3.706989008224147, + "tokens_seen": 849876992 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003749949849548646, + "loss": 3.115, + "theoretical_loss": 3.7069609819439258, + "tokens_seen": 849942528 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003749849548645938, + "loss": 2.89, + "theoretical_loss": 3.7069329584296553, + "tokens_seen": 850008064 + }, + { + "epoch": 2.05, + "learning_rate": 0.000374974924774323, + "loss": 2.8377, + "theoretical_loss": 3.706904937680849, + "tokens_seen": 850073600 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037496489468405214, + "loss": 2.9411, + "theoretical_loss": 3.7068769196970206, + "tokens_seen": 850139136 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003749548645937814, + "loss": 3.0076, + "theoretical_loss": 3.706848904477684, + "tokens_seen": 850204672 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003749448345035105, + "loss": 2.9767, + "theoretical_loss": 3.706820892022354, + "tokens_seen": 850270208 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1372309, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1208808422088623, + "objective/train/theoretical_loss": 3.7068068868310395, + "objective/train/tokens_used": 870762976, + "theoretical_loss": 3.7068068868310395, + "tokens_seen": 850302976 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037493480441323974, + "loss": 3.055, + "theoretical_loss": 3.7067928823305443, + "tokens_seen": 850335744 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003749247743229689, + "loss": 2.9401, + "theoretical_loss": 3.7067648754017695, + "tokens_seen": 850401280 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003749147442326981, + "loss": 3.0636, + "theoretical_loss": 3.7067368712355444, + "tokens_seen": 850466816 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003749047141424273, + "loss": 2.974, + "theoretical_loss": 3.7067088698313837, + "tokens_seen": 850532352 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037489468405215646, + "loss": 2.8921, + "theoretical_loss": 3.7066808711888006, + "tokens_seen": 850597888 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037488465396188564, + "loss": 3.0126, + "theoretical_loss": 3.7066528753073125, + "tokens_seen": 850663424 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003748746238716149, + "loss": 3.0312, + "theoretical_loss": 3.706624882186433, + "tokens_seen": 850728960 + }, + { + "epoch": 2.05, + "learning_rate": 0.000374864593781344, + "loss": 2.9824, + "theoretical_loss": 3.7065968918256775, + "tokens_seen": 850794496 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037485456369107324, + "loss": 2.7934, + "theoretical_loss": 3.706568904224561, + "tokens_seen": 850860032 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037484453360080237, + "loss": 2.9848, + "theoretical_loss": 3.7065409193825998, + "tokens_seen": 850925568 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003748345035105316, + "loss": 2.9744, + "theoretical_loss": 3.706512937299308, + "tokens_seen": 850991104 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003748244734202608, + "loss": 3.0423, + "theoretical_loss": 3.7064849579742027, + "tokens_seen": 851056640 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037481444332998997, + "loss": 3.1128, + "theoretical_loss": 3.706456981406798, + "tokens_seen": 851122176 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037480441323971915, + "loss": 3.0286, + "theoretical_loss": 3.7064290075966113, + "tokens_seen": 851187712 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003747943831494484, + "loss": 3.1896, + "theoretical_loss": 3.7064010365431583, + "tokens_seen": 851253248 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003747843530591775, + "loss": 3.0186, + "theoretical_loss": 3.706373068245955, + "tokens_seen": 851318784 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037477432296890675, + "loss": 2.8089, + "theoretical_loss": 3.7063451027045176, + "tokens_seen": 851384320 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037476429287863587, + "loss": 3.1464, + "theoretical_loss": 3.7063171399183616, + "tokens_seen": 851449856 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003747542627883651, + "loss": 3.0634, + "theoretical_loss": 3.706289179887005, + "tokens_seen": 851515392 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003747442326980943, + "loss": 2.9396, + "theoretical_loss": 3.706261222609964, + "tokens_seen": 851580928 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037473420260782347, + "loss": 3.0448, + "theoretical_loss": 3.7062332680867542, + "tokens_seen": 851646464 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037472417251755265, + "loss": 3.0598, + "theoretical_loss": 3.7062053163168933, + "tokens_seen": 851712000 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037471414242728183, + "loss": 2.9598, + "theoretical_loss": 3.706177367299899, + "tokens_seen": 851777536 + }, + { + "epoch": 2.05, + "learning_rate": 0.000374704112337011, + "loss": 2.8566, + "theoretical_loss": 3.7061494210352874, + "tokens_seen": 851843072 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037469408224674025, + "loss": 2.9008, + "theoretical_loss": 3.7061214775225766, + "tokens_seen": 851908608 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1375128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.997480869293213, + "objective/train/theoretical_loss": 3.7061075067980327, + "objective/train/tokens_used": 872401376, + "theoretical_loss": 3.7061075067980327, + "tokens_seen": 851941376 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003746840521564694, + "loss": 2.9529, + "theoretical_loss": 3.7060935367612826, + "tokens_seen": 851974144 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003746740220661986, + "loss": 2.9913, + "theoretical_loss": 3.7060655987509246, + "tokens_seen": 852039680 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037466399197592774, + "loss": 3.0511, + "theoretical_loss": 3.706037663491019, + "tokens_seen": 852105216 + }, + { + "epoch": 2.05, + "learning_rate": 0.000374653961885657, + "loss": 2.8334, + "theoretical_loss": 3.706009730981083, + "tokens_seen": 852170752 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037464393179538615, + "loss": 2.9704, + "theoretical_loss": 3.705981801220636, + "tokens_seen": 852236288 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037463390170511534, + "loss": 3.0599, + "theoretical_loss": 3.705953874209195, + "tokens_seen": 852301824 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037462387161484457, + "loss": 3.0536, + "theoretical_loss": 3.7059259499462778, + "tokens_seen": 852367360 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037461384152457375, + "loss": 2.9712, + "theoretical_loss": 3.705898028431404, + "tokens_seen": 852432896 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037460381143430293, + "loss": 3.106, + "theoretical_loss": 3.7058701096640903, + "tokens_seen": 852498432 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003745937813440321, + "loss": 3.0733, + "theoretical_loss": 3.7058421936438557, + "tokens_seen": 852563968 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003745837512537613, + "loss": 2.916, + "theoretical_loss": 3.7058142803702188, + "tokens_seen": 852629504 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003745737211634905, + "loss": 2.9078, + "theoretical_loss": 3.7057863698426994, + "tokens_seen": 852695040 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003745636910732197, + "loss": 2.9002, + "theoretical_loss": 3.705758462060814, + "tokens_seen": 852760576 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037455366098294884, + "loss": 2.9557, + "theoretical_loss": 3.7057305570240837, + "tokens_seen": 852826112 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003745436308926781, + "loss": 3.0314, + "theoretical_loss": 3.7057026547320264, + "tokens_seen": 852891648 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003745336008024072, + "loss": 2.9457, + "theoretical_loss": 3.7056747551841616, + "tokens_seen": 852957184 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037452357071213644, + "loss": 2.9739, + "theoretical_loss": 3.705646858380008, + "tokens_seen": 853022720 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003745135406218656, + "loss": 3.0425, + "theoretical_loss": 3.705618964319086, + "tokens_seen": 853088256 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003745035105315948, + "loss": 2.9702, + "theoretical_loss": 3.7055910730009147, + "tokens_seen": 853153792 + }, + { + "epoch": 2.05, + "learning_rate": 0.000374493480441324, + "loss": 3.0997, + "theoretical_loss": 3.7055631844250136, + "tokens_seen": 853219328 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003744834503510532, + "loss": 2.9588, + "theoretical_loss": 3.7055352985909025, + "tokens_seen": 853284864 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037447342026078234, + "loss": 3.0057, + "theoretical_loss": 3.7055074154981016, + "tokens_seen": 853350400 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003744633901705116, + "loss": 3.0146, + "theoretical_loss": 3.705479535146131, + "tokens_seen": 853415936 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003744533600802407, + "loss": 2.9319, + "theoretical_loss": 3.70545165753451, + "tokens_seen": 853481472 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037444332998996994, + "loss": 3.034, + "theoretical_loss": 3.7054237826627596, + "tokens_seen": 853547008 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1376507, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7947936058044434, + "objective/train/theoretical_loss": 3.7054098462541862, + "objective/train/tokens_used": 874039776, + "theoretical_loss": 3.7054098462541862, + "tokens_seen": 853579776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003744332998996991, + "loss": 2.9844, + "theoretical_loss": 3.7053959105304006, + "tokens_seen": 853612544 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003744232698094283, + "loss": 2.921, + "theoretical_loss": 3.705368041136952, + "tokens_seen": 853678080 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003744132397191575, + "loss": 2.983, + "theoretical_loss": 3.7053401744819356, + "tokens_seen": 853743616 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037440320962888666, + "loss": 2.9322, + "theoretical_loss": 3.7053123105648718, + "tokens_seen": 853809152 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037439317953861585, + "loss": 3.0152, + "theoretical_loss": 3.7052844493852817, + "tokens_seen": 853874688 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003743831494483451, + "loss": 2.9872, + "theoretical_loss": 3.7052565909426862, + "tokens_seen": 853940224 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003743731193580742, + "loss": 2.9602, + "theoretical_loss": 3.7052287352366067, + "tokens_seen": 854005760 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037436308926780344, + "loss": 2.9586, + "theoretical_loss": 3.7052008822665634, + "tokens_seen": 854071296 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037435305917753257, + "loss": 2.9226, + "theoretical_loss": 3.7051730320320786, + "tokens_seen": 854136832 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003743430290872618, + "loss": 2.991, + "theoretical_loss": 3.7051451845326735, + "tokens_seen": 854202368 + }, + { + "epoch": 2.05, + "learning_rate": 0.000374332998996991, + "loss": 3.0132, + "theoretical_loss": 3.7051173397678694, + "tokens_seen": 854267904 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037432296890672017, + "loss": 2.9738, + "theoretical_loss": 3.7050894977371884, + "tokens_seen": 854333440 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037431293881644935, + "loss": 2.6965, + "theoretical_loss": 3.705061658440152, + "tokens_seen": 854398976 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003743029087261786, + "loss": 2.8344, + "theoretical_loss": 3.705033821876283, + "tokens_seen": 854464512 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003742928786359077, + "loss": 2.9038, + "theoretical_loss": 3.7050059880451025, + "tokens_seen": 854530048 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037428284854563695, + "loss": 2.9595, + "theoretical_loss": 3.704978156946132, + "tokens_seen": 854595584 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037427281845536607, + "loss": 2.9456, + "theoretical_loss": 3.704950328578896, + "tokens_seen": 854661120 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003742627883650953, + "loss": 2.9223, + "theoretical_loss": 3.7049225029429156, + "tokens_seen": 854726656 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003742527582748245, + "loss": 3.006, + "theoretical_loss": 3.704894680037713, + "tokens_seen": 854792192 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037424272818455367, + "loss": 3.0117, + "theoretical_loss": 3.704866859862812, + "tokens_seen": 854857728 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037423269809428285, + "loss": 2.9285, + "theoretical_loss": 3.704839042417734, + "tokens_seen": 854923264 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037422266800401203, + "loss": 2.9576, + "theoretical_loss": 3.7048112277020033, + "tokens_seen": 854988800 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003742126379137412, + "loss": 3.1281, + "theoretical_loss": 3.704783415715142, + "tokens_seen": 855054336 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037420260782347045, + "loss": 3.0996, + "theoretical_loss": 3.7047556064566733, + "tokens_seen": 855119872 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003741925777331996, + "loss": 2.9224, + "theoretical_loss": 3.704727799926121, + "tokens_seen": 855185408 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1380476, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.042532444000244, + "objective/train/theoretical_loss": 3.704713897683664, + "objective/train/tokens_used": 875678176, + "theoretical_loss": 3.704713897683664, + "tokens_seen": 855218176 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003741825476429288, + "loss": 2.9678, + "theoretical_loss": 3.704699996123008, + "tokens_seen": 855250944 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037417251755265794, + "loss": 2.9639, + "theoretical_loss": 3.704672195046858, + "tokens_seen": 855316480 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003741624874623872, + "loss": 2.9729, + "theoretical_loss": 3.704644396697195, + "tokens_seen": 855382016 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037415245737211635, + "loss": 2.9183, + "theoretical_loss": 3.7046166010735417, + "tokens_seen": 855447552 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037414242728184554, + "loss": 3.0126, + "theoretical_loss": 3.7045888081754237, + "tokens_seen": 855513088 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003741323971915747, + "loss": 2.9743, + "theoretical_loss": 3.704561018002363, + "tokens_seen": 855578624 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037412236710130395, + "loss": 3.1039, + "theoretical_loss": 3.704533230553885, + "tokens_seen": 855644160 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003741123370110331, + "loss": 2.8606, + "theoretical_loss": 3.704505445829513, + "tokens_seen": 855709696 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003741023069207623, + "loss": 3.0294, + "theoretical_loss": 3.704477663828773, + "tokens_seen": 855775232 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037409227683049144, + "loss": 2.8879, + "theoretical_loss": 3.7044498845511873, + "tokens_seen": 855840768 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003740822467402207, + "loss": 2.8135, + "theoretical_loss": 3.7044221079962822, + "tokens_seen": 855906304 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037407221664994986, + "loss": 2.9505, + "theoretical_loss": 3.7043943341635814, + "tokens_seen": 855971840 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037406218655967904, + "loss": 3.0089, + "theoretical_loss": 3.70436656305261, + "tokens_seen": 856037376 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003740521564694082, + "loss": 3.0585, + "theoretical_loss": 3.7043387946628936, + "tokens_seen": 856102912 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003740421263791374, + "loss": 2.9203, + "theoretical_loss": 3.7043110289939563, + "tokens_seen": 856168448 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003740320962888666, + "loss": 2.8678, + "theoretical_loss": 3.7042832660453238, + "tokens_seen": 856233984 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003740220661985958, + "loss": 2.8759, + "theoretical_loss": 3.7042555058165214, + "tokens_seen": 856299520 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037401203610832494, + "loss": 3.1037, + "theoretical_loss": 3.7042277483070745, + "tokens_seen": 856365056 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003740020060180542, + "loss": 2.9943, + "theoretical_loss": 3.704199993516508, + "tokens_seen": 856430592 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037399197592778336, + "loss": 2.9545, + "theoretical_loss": 3.7041722414443483, + "tokens_seen": 856496128 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037398194583751254, + "loss": 2.9776, + "theoretical_loss": 3.704144492090121, + "tokens_seen": 856561664 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003739719157472417, + "loss": 2.9682, + "theoretical_loss": 3.704116745453352, + "tokens_seen": 856627200 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003739618856569709, + "loss": 3.058, + "theoretical_loss": 3.7040890015335672, + "tokens_seen": 856692736 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003739518555667001, + "loss": 2.8806, + "theoretical_loss": 3.7040612603302927, + "tokens_seen": 856758272 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003739418254764293, + "loss": 3.0113, + "theoretical_loss": 3.7040335218430553, + "tokens_seen": 856823808 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1381916, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.183521270751953, + "objective/train/theoretical_loss": 3.7040196536178023, + "objective/train/tokens_used": 877316576, + "theoretical_loss": 3.7040196536178023, + "tokens_seen": 856856576 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037393179538615845, + "loss": 2.8277, + "theoretical_loss": 3.7040057860713804, + "tokens_seen": 856889344 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003739217652958877, + "loss": 2.9992, + "theoretical_loss": 3.7039780530147954, + "tokens_seen": 856954880 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003739117352056168, + "loss": 2.9608, + "theoretical_loss": 3.703950322672826, + "tokens_seen": 857020416 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037390170511534605, + "loss": 2.7913, + "theoretical_loss": 3.7039225950450003, + "tokens_seen": 857085952 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003738916750250752, + "loss": 2.9736, + "theoretical_loss": 3.703894870130844, + "tokens_seen": 857151488 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003738816449348044, + "loss": 2.9525, + "theoretical_loss": 3.703867147929884, + "tokens_seen": 857217024 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037387161484453364, + "loss": 2.9483, + "theoretical_loss": 3.7038394284416483, + "tokens_seen": 857282560 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037386158475426277, + "loss": 2.8424, + "theoretical_loss": 3.7038117116656633, + "tokens_seen": 857348096 + }, + { + "epoch": 2.05, + "learning_rate": 0.000373851554663992, + "loss": 3.0483, + "theoretical_loss": 3.7037839976014566, + "tokens_seen": 857413632 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003738415245737212, + "loss": 3.0027, + "theoretical_loss": 3.703756286248556, + "tokens_seen": 857479168 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037383149448345037, + "loss": 2.8808, + "theoretical_loss": 3.703728577606488, + "tokens_seen": 857544704 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037382146439317955, + "loss": 2.8638, + "theoretical_loss": 3.7037008716747812, + "tokens_seen": 857610240 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003738114343029088, + "loss": 3.0393, + "theoretical_loss": 3.703673168452963, + "tokens_seen": 857675776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003738014042126379, + "loss": 2.8422, + "theoretical_loss": 3.7036454679405617, + "tokens_seen": 857741312 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037379137412236715, + "loss": 2.8964, + "theoretical_loss": 3.7036177701371056, + "tokens_seen": 857806848 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037378134403209627, + "loss": 2.8866, + "theoretical_loss": 3.703590075042121, + "tokens_seen": 857872384 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003737713139418255, + "loss": 2.9896, + "theoretical_loss": 3.703562382655139, + "tokens_seen": 857937920 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003737612838515547, + "loss": 3.0748, + "theoretical_loss": 3.7035346929756856, + "tokens_seen": 858003456 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037375125376128387, + "loss": 3.0441, + "theoretical_loss": 3.7035070060032904, + "tokens_seen": 858068992 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037374122367101305, + "loss": 3.0327, + "theoretical_loss": 3.703479321737482, + "tokens_seen": 858134528 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037373119358074223, + "loss": 2.9579, + "theoretical_loss": 3.703451640177789, + "tokens_seen": 858200064 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003737211634904714, + "loss": 2.8862, + "theoretical_loss": 3.7034239613237396, + "tokens_seen": 858265600 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037371113340020065, + "loss": 2.9772, + "theoretical_loss": 3.703396285174864, + "tokens_seen": 858331136 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003737011033099298, + "loss": 3.0003, + "theoretical_loss": 3.7033686117306908, + "tokens_seen": 858396672 + }, + { + "epoch": 2.05, + "learning_rate": 0.000373691073219659, + "loss": 2.7422, + "theoretical_loss": 3.703340940990749, + "tokens_seen": 858462208 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1384663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7580673694610596, + "objective/train/theoretical_loss": 3.7033271066347178, + "objective/train/tokens_used": 878954976, + "theoretical_loss": 3.7033271066347178, + "tokens_seen": 858494976 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037368104312938814, + "loss": 2.8691, + "theoretical_loss": 3.703313272954568, + "tokens_seen": 858527744 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003736710130391174, + "loss": 2.8233, + "theoretical_loss": 3.7032856076216767, + "tokens_seen": 858593280 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037366098294884655, + "loss": 2.9888, + "theoretical_loss": 3.7032579449916065, + "tokens_seen": 858658816 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037365095285857574, + "loss": 3.0966, + "theoretical_loss": 3.703230285063885, + "tokens_seen": 858724352 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003736409227683049, + "loss": 2.9732, + "theoretical_loss": 3.7032026278380425, + "tokens_seen": 858789888 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037363089267803415, + "loss": 2.975, + "theoretical_loss": 3.70317497331361, + "tokens_seen": 858855424 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003736208625877633, + "loss": 3.0573, + "theoretical_loss": 3.7031473214901167, + "tokens_seen": 858920960 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003736108324974925, + "loss": 2.9175, + "theoretical_loss": 3.7031196723670923, + "tokens_seen": 858986496 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037360080240722164, + "loss": 2.9484, + "theoretical_loss": 3.703092025944068, + "tokens_seen": 859052032 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003735907723169509, + "loss": 2.901, + "theoretical_loss": 3.7030643822205738, + "tokens_seen": 859117568 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037358074222668006, + "loss": 2.868, + "theoretical_loss": 3.70303674119614, + "tokens_seen": 859183104 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037357071213640924, + "loss": 2.806, + "theoretical_loss": 3.703009102870298, + "tokens_seen": 859248640 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003735606820461384, + "loss": 2.9984, + "theoretical_loss": 3.702981467242578, + "tokens_seen": 859314176 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003735506519558676, + "loss": 3.0834, + "theoretical_loss": 3.7029538343125106, + "tokens_seen": 859379712 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003735406218655968, + "loss": 2.9274, + "theoretical_loss": 3.7029262040796267, + "tokens_seen": 859445248 + }, + { + "epoch": 2.05, + "learning_rate": 0.000373530591775326, + "loss": 2.8378, + "theoretical_loss": 3.7028985765434577, + "tokens_seen": 859510784 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037352056168505514, + "loss": 2.9778, + "theoretical_loss": 3.7028709517035354, + "tokens_seen": 859576320 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003735105315947844, + "loss": 2.9356, + "theoretical_loss": 3.70284332955939, + "tokens_seen": 859641856 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037350050150451356, + "loss": 2.9518, + "theoretical_loss": 3.7028157101105537, + "tokens_seen": 859707392 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037349047141424274, + "loss": 2.9231, + "theoretical_loss": 3.702788093356558, + "tokens_seen": 859772928 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003734804413239719, + "loss": 3.0761, + "theoretical_loss": 3.702760479296934, + "tokens_seen": 859838464 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003734704112337011, + "loss": 2.8693, + "theoretical_loss": 3.702732867931214, + "tokens_seen": 859904000 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003734603811434303, + "loss": 3.012, + "theoretical_loss": 3.7027052592589307, + "tokens_seen": 859969536 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003734503510531595, + "loss": 2.979, + "theoretical_loss": 3.702677653279615, + "tokens_seen": 860035072 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037344032096288865, + "loss": 2.9879, + "theoretical_loss": 3.7026500499927986, + "tokens_seen": 860100608 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1387487, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.769348382949829, + "objective/train/theoretical_loss": 3.7026362493589318, + "objective/train/tokens_used": 880593376, + "theoretical_loss": 3.7026362493589318, + "tokens_seen": 860133376 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003734302908726179, + "loss": 2.9493, + "theoretical_loss": 3.702622449398015, + "tokens_seen": 860166144 + }, + { + "epoch": 2.05, + "learning_rate": 0.000373420260782347, + "loss": 3.1075, + "theoretical_loss": 3.702594851494796, + "tokens_seen": 860231680 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037341023069207625, + "loss": 2.9739, + "theoretical_loss": 3.7025672562826744, + "tokens_seen": 860297216 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003734002006018054, + "loss": 2.7997, + "theoretical_loss": 3.702539663761182, + "tokens_seen": 860362752 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003733901705115346, + "loss": 2.8909, + "theoretical_loss": 3.702512073929853, + "tokens_seen": 860428288 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003733801404212638, + "loss": 3.0551, + "theoretical_loss": 3.7024844867882187, + "tokens_seen": 860493824 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037337011033099297, + "loss": 2.9193, + "theoretical_loss": 3.702456902335813, + "tokens_seen": 860559360 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037336008024072215, + "loss": 2.9059, + "theoretical_loss": 3.7024293205721683, + "tokens_seen": 860624896 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003733500501504514, + "loss": 2.9866, + "theoretical_loss": 3.7024017414968187, + "tokens_seen": 860690432 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003733400200601805, + "loss": 3.0207, + "theoretical_loss": 3.7023741651092967, + "tokens_seen": 860755968 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037332998996990975, + "loss": 2.9577, + "theoretical_loss": 3.7023465914091362, + "tokens_seen": 860821504 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037331995987963893, + "loss": 2.9699, + "theoretical_loss": 3.702319020395871, + "tokens_seen": 860887040 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003733099297893681, + "loss": 2.9984, + "theoretical_loss": 3.7022914520690344, + "tokens_seen": 860952576 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003732998996990973, + "loss": 3.0171, + "theoretical_loss": 3.7022638864281596, + "tokens_seen": 861018112 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003732898696088265, + "loss": 2.7939, + "theoretical_loss": 3.702236323472781, + "tokens_seen": 861083648 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037327983951855565, + "loss": 3.0251, + "theoretical_loss": 3.7022087632024334, + "tokens_seen": 861149184 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003732698094282849, + "loss": 2.9463, + "theoretical_loss": 3.7021812056166494, + "tokens_seen": 861214720 + }, + { + "epoch": 2.05, + "learning_rate": 0.000373259779338014, + "loss": 2.9513, + "theoretical_loss": 3.7021536507149646, + "tokens_seen": 861280256 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037324974924774325, + "loss": 2.9669, + "theoretical_loss": 3.7021260984969127, + "tokens_seen": 861345792 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003732397191574724, + "loss": 3.0842, + "theoretical_loss": 3.702098548962028, + "tokens_seen": 861411328 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003732296890672016, + "loss": 2.8935, + "theoretical_loss": 3.7020710021098457, + "tokens_seen": 861476864 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003732196589769308, + "loss": 2.8587, + "theoretical_loss": 3.7020434579399004, + "tokens_seen": 861542400 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037320962888666, + "loss": 2.9489, + "theoretical_loss": 3.7020159164517263, + "tokens_seen": 861607936 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037319959879638916, + "loss": 2.9749, + "theoretical_loss": 3.701988377644859, + "tokens_seen": 861673472 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037318956870611834, + "loss": 3.0112, + "theoretical_loss": 3.701960841518833, + "tokens_seen": 861739008 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1390370, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9685933589935303, + "objective/train/theoretical_loss": 3.701947074460991, + "objective/train/tokens_used": 882231776, + "theoretical_loss": 3.701947074460991, + "tokens_seen": 861771776 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003731795386158475, + "loss": 3.0295, + "theoretical_loss": 3.701933308073184, + "tokens_seen": 861804544 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037316950852557675, + "loss": 2.7227, + "theoretical_loss": 3.7019057773074473, + "tokens_seen": 861870080 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003731594784353059, + "loss": 2.9065, + "theoretical_loss": 3.701878249221158, + "tokens_seen": 861935616 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003731494483450351, + "loss": 2.8347, + "theoretical_loss": 3.701850723813852, + "tokens_seen": 862001152 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003731394182547643, + "loss": 2.883, + "theoretical_loss": 3.7018232010850642, + "tokens_seen": 862066688 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003731293881644935, + "loss": 3.0355, + "theoretical_loss": 3.701795681034331, + "tokens_seen": 862132224 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003731193580742227, + "loss": 2.9568, + "theoretical_loss": 3.701768163661188, + "tokens_seen": 862197760 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037310932798395184, + "loss": 2.986, + "theoretical_loss": 3.7017406489651714, + "tokens_seen": 862263296 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003730992978936811, + "loss": 3.0888, + "theoretical_loss": 3.701713136945817, + "tokens_seen": 862328832 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037308926780341026, + "loss": 2.9791, + "theoretical_loss": 3.7016856276026613, + "tokens_seen": 862394368 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037307923771313944, + "loss": 2.866, + "theoretical_loss": 3.7016581209352406, + "tokens_seen": 862459904 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003730692076228686, + "loss": 2.8743, + "theoretical_loss": 3.701630616943091, + "tokens_seen": 862525440 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003730591775325978, + "loss": 2.7641, + "theoretical_loss": 3.7016031156257494, + "tokens_seen": 862590976 + }, + { + "epoch": 2.05, + "learning_rate": 0.000373049147442327, + "loss": 2.8712, + "theoretical_loss": 3.701575616982753, + "tokens_seen": 862656512 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003730391173520562, + "loss": 3.0581, + "theoretical_loss": 3.701548121013637, + "tokens_seen": 862722048 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037302908726178534, + "loss": 2.9978, + "theoretical_loss": 3.70152062771794, + "tokens_seen": 862787584 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003730190571715146, + "loss": 2.9388, + "theoretical_loss": 3.7014931370951984, + "tokens_seen": 862853120 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037300902708124376, + "loss": 3.0744, + "theoretical_loss": 3.701465649144949, + "tokens_seen": 862918656 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037299899699097294, + "loss": 2.9562, + "theoretical_loss": 3.70143816386673, + "tokens_seen": 862984192 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003729889669007021, + "loss": 2.9354, + "theoretical_loss": 3.7014106812600778, + "tokens_seen": 863049728 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003729789368104313, + "loss": 3.0097, + "theoretical_loss": 3.70138320132453, + "tokens_seen": 863115264 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003729689067201605, + "loss": 3.0616, + "theoretical_loss": 3.701355724059624, + "tokens_seen": 863180800 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003729588766298897, + "loss": 2.9367, + "theoretical_loss": 3.7013282494648987, + "tokens_seen": 863246336 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037294884653961885, + "loss": 2.6795, + "theoretical_loss": 3.701300777539891, + "tokens_seen": 863311872 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003729388164493481, + "loss": 3.0528, + "theoretical_loss": 3.701273308284139, + "tokens_seen": 863377408 + }, + { + "epoch": 2.05, + "objective/train/docs_used": 1392731, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.83223819732666, + "objective/train/theoretical_loss": 3.7012595746570893, + "objective/train/tokens_used": 883870176, + "theoretical_loss": 3.7012595746570893, + "tokens_seen": 863410176 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003729287863590772, + "loss": 2.8795, + "theoretical_loss": 3.7012458416971805, + "tokens_seen": 863442944 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037291875626880645, + "loss": 2.9912, + "theoretical_loss": 3.701218377778554, + "tokens_seen": 863508480 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003729087261785356, + "loss": 2.9812, + "theoretical_loss": 3.701190916527798, + "tokens_seen": 863574016 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003728986960882648, + "loss": 2.9687, + "theoretical_loss": 3.7011634579444506, + "tokens_seen": 863639552 + }, + { + "epoch": 2.05, + "learning_rate": 0.000372888665997994, + "loss": 2.8748, + "theoretical_loss": 3.70113600202805, + "tokens_seen": 863705088 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037287863590772317, + "loss": 2.9578, + "theoretical_loss": 3.7011085487781354, + "tokens_seen": 863770624 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037286860581745235, + "loss": 2.9853, + "theoretical_loss": 3.701081098194246, + "tokens_seen": 863836160 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003728585757271816, + "loss": 2.9113, + "theoretical_loss": 3.7010536502759193, + "tokens_seen": 863901696 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003728485456369107, + "loss": 2.8989, + "theoretical_loss": 3.7010262050226954, + "tokens_seen": 863967232 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037283851554663995, + "loss": 2.999, + "theoretical_loss": 3.7009987624341125, + "tokens_seen": 864032768 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037282848545636913, + "loss": 2.9549, + "theoretical_loss": 3.7009713225097105, + "tokens_seen": 864098304 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003728184553660983, + "loss": 2.9155, + "theoretical_loss": 3.7009438852490284, + "tokens_seen": 864163840 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003728084252758275, + "loss": 3.0174, + "theoretical_loss": 3.7009164506516066, + "tokens_seen": 864229376 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003727983951855567, + "loss": 2.9608, + "theoretical_loss": 3.7008890187169827, + "tokens_seen": 864294912 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037278836509528585, + "loss": 2.9641, + "theoretical_loss": 3.700861589444698, + "tokens_seen": 864360448 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003727783350050151, + "loss": 2.839, + "theoretical_loss": 3.7008341628342922, + "tokens_seen": 864425984 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003727683049147442, + "loss": 2.8729, + "theoretical_loss": 3.7008067388853045, + "tokens_seen": 864491520 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037275827482447345, + "loss": 2.9509, + "theoretical_loss": 3.7007793175972754, + "tokens_seen": 864557056 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003727482447342026, + "loss": 2.8995, + "theoretical_loss": 3.7007518989697443, + "tokens_seen": 864622592 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003727382146439318, + "loss": 3.0586, + "theoretical_loss": 3.7007244830022525, + "tokens_seen": 864688128 + }, + { + "epoch": 2.05, + "learning_rate": 0.000372728184553661, + "loss": 2.9427, + "theoretical_loss": 3.700697069694339, + "tokens_seen": 864753664 + }, + { + "epoch": 2.05, + "learning_rate": 0.0003727181544633902, + "loss": 3.008, + "theoretical_loss": 3.700669659045546, + "tokens_seen": 864819200 + }, + { + "epoch": 2.05, + "learning_rate": 0.00037270812437311936, + "loss": 2.9381, + "theoretical_loss": 3.700642251055413, + "tokens_seen": 864884736 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037269809428284854, + "loss": 2.927, + "theoretical_loss": 3.7006148457234804, + "tokens_seen": 864950272 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003726880641925777, + "loss": 2.9514, + "theoretical_loss": 3.70058744304929, + "tokens_seen": 865015808 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1395602, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9346680641174316, + "objective/train/theoretical_loss": 3.7005737427087046, + "objective/train/tokens_used": 885508576, + "theoretical_loss": 3.7005737427087046, + "tokens_seen": 865048576 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037267803410230696, + "loss": 2.9966, + "theoretical_loss": 3.7005600430323824, + "tokens_seen": 865081344 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003726680040120361, + "loss": 2.9911, + "theoretical_loss": 3.700532645672298, + "tokens_seen": 865146880 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003726579739217653, + "loss": 3.0172, + "theoretical_loss": 3.700505250968578, + "tokens_seen": 865212416 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003726479438314945, + "loss": 2.998, + "theoretical_loss": 3.700477858920765, + "tokens_seen": 865277952 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003726379137412237, + "loss": 2.9932, + "theoretical_loss": 3.7004504695283984, + "tokens_seen": 865343488 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037262788365095286, + "loss": 2.9275, + "theoretical_loss": 3.700423082791022, + "tokens_seen": 865409024 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037261785356068204, + "loss": 2.9455, + "theoretical_loss": 3.700395698708175, + "tokens_seen": 865474560 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003726078234704112, + "loss": 2.9728, + "theoretical_loss": 3.700368317279401, + "tokens_seen": 865540096 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037259779338014046, + "loss": 2.9496, + "theoretical_loss": 3.700340938504241, + "tokens_seen": 865605632 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003725877632898696, + "loss": 3.041, + "theoretical_loss": 3.700313562382237, + "tokens_seen": 865671168 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003725777331995988, + "loss": 3.0692, + "theoretical_loss": 3.7002861889129313, + "tokens_seen": 865736704 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037256770310932795, + "loss": 2.9169, + "theoretical_loss": 3.7002588180958655, + "tokens_seen": 865802240 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003725576730190572, + "loss": 3.0265, + "theoretical_loss": 3.7002314499305826, + "tokens_seen": 865867776 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037254764292878636, + "loss": 3.0445, + "theoretical_loss": 3.700204084416625, + "tokens_seen": 865933312 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037253761283851554, + "loss": 2.9892, + "theoretical_loss": 3.7001767215535346, + "tokens_seen": 865998848 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003725275827482447, + "loss": 2.8633, + "theoretical_loss": 3.7001493613408547, + "tokens_seen": 866064384 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037251755265797396, + "loss": 2.9397, + "theoretical_loss": 3.700122003778127, + "tokens_seen": 866129920 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003725075225677031, + "loss": 3.0491, + "theoretical_loss": 3.700094648864896, + "tokens_seen": 866195456 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003724974924774323, + "loss": 2.8852, + "theoretical_loss": 3.700067296600703, + "tokens_seen": 866260992 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037248746238716145, + "loss": 2.8813, + "theoretical_loss": 3.7000399469850924, + "tokens_seen": 866326528 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003724774322968907, + "loss": 2.956, + "theoretical_loss": 3.7000126000176063, + "tokens_seen": 866392064 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037246740220661987, + "loss": 3.0795, + "theoretical_loss": 3.699985255697789, + "tokens_seen": 866457600 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037245737211634905, + "loss": 3.0422, + "theoretical_loss": 3.699957914025183, + "tokens_seen": 866523136 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037244734202607823, + "loss": 2.9865, + "theoretical_loss": 3.6999305749993328, + "tokens_seen": 866588672 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003724373119358074, + "loss": 2.9379, + "theoretical_loss": 3.6999032386197817, + "tokens_seen": 866654208 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1398454, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1830058097839355, + "objective/train/theoretical_loss": 3.6998895714222253, + "objective/train/tokens_used": 887146976, + "theoretical_loss": 3.6998895714222253, + "tokens_seen": 866686976 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003724272818455366, + "loss": 2.9469, + "theoretical_loss": 3.699875904886073, + "tokens_seen": 866719744 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003724172517552658, + "loss": 2.9546, + "theoretical_loss": 3.699848573797751, + "tokens_seen": 866785280 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037240722166499495, + "loss": 3.0658, + "theoretical_loss": 3.6998212453543595, + "tokens_seen": 866850816 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003723971915747242, + "loss": 2.9232, + "theoretical_loss": 3.699793919555443, + "tokens_seen": 866916352 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003723871614844533, + "loss": 2.9736, + "theoretical_loss": 3.6997665964005453, + "tokens_seen": 866981888 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037237713139418255, + "loss": 3.0296, + "theoretical_loss": 3.6997392758892107, + "tokens_seen": 867047424 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003723671013039118, + "loss": 3.0703, + "theoretical_loss": 3.699711958020984, + "tokens_seen": 867112960 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003723570712136409, + "loss": 2.839, + "theoretical_loss": 3.6996846427954093, + "tokens_seen": 867178496 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037234704112337015, + "loss": 3.0264, + "theoretical_loss": 3.699657330212032, + "tokens_seen": 867244032 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037233701103309933, + "loss": 3.0874, + "theoretical_loss": 3.699630020270396, + "tokens_seen": 867309568 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003723269809428285, + "loss": 2.9326, + "theoretical_loss": 3.699602712970047, + "tokens_seen": 867375104 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003723169508525577, + "loss": 3.1257, + "theoretical_loss": 3.6995754083105297, + "tokens_seen": 867440640 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003723069207622869, + "loss": 3.1505, + "theoretical_loss": 3.699548106291389, + "tokens_seen": 867506176 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037229689067201605, + "loss": 2.9575, + "theoretical_loss": 3.6995208069121706, + "tokens_seen": 867571712 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003722868605817453, + "loss": 2.8577, + "theoretical_loss": 3.699493510172419, + "tokens_seen": 867637248 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003722768304914744, + "loss": 2.9594, + "theoretical_loss": 3.6994662160716807, + "tokens_seen": 867702784 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037226680040120365, + "loss": 3.1646, + "theoretical_loss": 3.6994389246095, + "tokens_seen": 867768320 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003722567703109328, + "loss": 2.9642, + "theoretical_loss": 3.699411635785424, + "tokens_seen": 867833856 + }, + { + "epoch": 2.06, + "learning_rate": 0.000372246740220662, + "loss": 2.9225, + "theoretical_loss": 3.699384349598998, + "tokens_seen": 867899392 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003722367101303912, + "loss": 2.8618, + "theoretical_loss": 3.6993570660497674, + "tokens_seen": 867964928 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003722266800401204, + "loss": 3.0036, + "theoretical_loss": 3.6993297851372784, + "tokens_seen": 868030464 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037221664994984956, + "loss": 3.0263, + "theoretical_loss": 3.699302506861078, + "tokens_seen": 868096000 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037220661985957874, + "loss": 3.0682, + "theoretical_loss": 3.699275231220711, + "tokens_seen": 868161536 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003721965897693079, + "loss": 3.0396, + "theoretical_loss": 3.699247958215725, + "tokens_seen": 868227072 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037218655967903716, + "loss": 3.0163, + "theoretical_loss": 3.699220687845666, + "tokens_seen": 868292608 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1401149, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.953563928604126, + "objective/train/theoretical_loss": 3.6992070536485926, + "objective/train/tokens_used": 888785376, + "theoretical_loss": 3.6992070536485926, + "tokens_seen": 868325376 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003721765295887663, + "loss": 3.0597, + "theoretical_loss": 3.6991934201100807, + "tokens_seen": 868358144 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003721664994984955, + "loss": 2.9484, + "theoretical_loss": 3.6991661550085153, + "tokens_seen": 868423680 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003721564694082247, + "loss": 2.639, + "theoretical_loss": 3.6991388925405175, + "tokens_seen": 868489216 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003721464393179539, + "loss": 2.968, + "theoretical_loss": 3.6991116327056335, + "tokens_seen": 868554752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037213640922768306, + "loss": 2.911, + "theoretical_loss": 3.6990843755034106, + "tokens_seen": 868620288 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037212637913741224, + "loss": 3.0128, + "theoretical_loss": 3.6990571209333956, + "tokens_seen": 868685824 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003721163490471414, + "loss": 2.8139, + "theoretical_loss": 3.6990298689951366, + "tokens_seen": 868751360 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037210631895687066, + "loss": 3.0316, + "theoretical_loss": 3.69900261968818, + "tokens_seen": 868816896 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003720962888665998, + "loss": 3.0001, + "theoretical_loss": 3.6989753730120736, + "tokens_seen": 868882432 + }, + { + "epoch": 2.06, + "learning_rate": 0.000372086258776329, + "loss": 2.7932, + "theoretical_loss": 3.6989481289663653, + "tokens_seen": 868947968 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037207622868605815, + "loss": 2.9665, + "theoretical_loss": 3.698920887550603, + "tokens_seen": 869013504 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003720661985957874, + "loss": 3.0789, + "theoretical_loss": 3.698893648764334, + "tokens_seen": 869079040 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037205616850551656, + "loss": 2.7902, + "theoretical_loss": 3.6988664126071056, + "tokens_seen": 869144576 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037204613841524575, + "loss": 2.8977, + "theoretical_loss": 3.698839179078467, + "tokens_seen": 869210112 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003720361083249749, + "loss": 2.9913, + "theoretical_loss": 3.6988119481779664, + "tokens_seen": 869275648 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037202607823470416, + "loss": 2.9667, + "theoretical_loss": 3.6987847199051513, + "tokens_seen": 869341184 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003720160481444333, + "loss": 2.8532, + "theoretical_loss": 3.6987574942595702, + "tokens_seen": 869406720 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003720060180541625, + "loss": 2.9083, + "theoretical_loss": 3.6987302712407715, + "tokens_seen": 869472256 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037199598796389165, + "loss": 2.9841, + "theoretical_loss": 3.698703050848305, + "tokens_seen": 869537792 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003719859578736209, + "loss": 2.8875, + "theoretical_loss": 3.698675833081718, + "tokens_seen": 869603328 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037197592778335007, + "loss": 2.9625, + "theoretical_loss": 3.6986486179405595, + "tokens_seen": 869668864 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037196589769307925, + "loss": 2.885, + "theoretical_loss": 3.698621405424379, + "tokens_seen": 869734400 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037195586760280843, + "loss": 2.7393, + "theoretical_loss": 3.698594195532726, + "tokens_seen": 869799936 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003719458375125376, + "loss": 2.7695, + "theoretical_loss": 3.6985669882651475, + "tokens_seen": 869865472 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003719358074222668, + "loss": 2.9453, + "theoretical_loss": 3.6985397836211953, + "tokens_seen": 869931008 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1402530, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0285484790802, + "objective/train/theoretical_loss": 3.698526182282938, + "objective/train/tokens_used": 890423776, + "theoretical_loss": 3.698526182282938, + "tokens_seen": 869963776 + }, + { + "epoch": 2.06, + "learning_rate": 0.000371925777331996, + "loss": 2.9929, + "theoretical_loss": 3.6985125816004176, + "tokens_seen": 869996544 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037191574724172515, + "loss": 2.8632, + "theoretical_loss": 3.698485382202364, + "tokens_seen": 870062080 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003719057171514544, + "loss": 2.8759, + "theoretical_loss": 3.698458185426583, + "tokens_seen": 870127616 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003718956870611835, + "loss": 2.9601, + "theoretical_loss": 3.6984309912726268, + "tokens_seen": 870193152 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037188565697091275, + "loss": 2.9189, + "theoretical_loss": 3.698403799740043, + "tokens_seen": 870258688 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037187562688064193, + "loss": 2.9159, + "theoretical_loss": 3.6983766108283826, + "tokens_seen": 870324224 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003718655967903711, + "loss": 3.0472, + "theoretical_loss": 3.6983494245371955, + "tokens_seen": 870389760 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003718555667001003, + "loss": 3.0712, + "theoretical_loss": 3.6983222408660317, + "tokens_seen": 870455296 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037184553660982953, + "loss": 2.9327, + "theoretical_loss": 3.6982950598144413, + "tokens_seen": 870520832 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037183550651955866, + "loss": 2.9, + "theoretical_loss": 3.6982678813819754, + "tokens_seen": 870586368 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003718254764292879, + "loss": 2.9223, + "theoretical_loss": 3.6982407055681836, + "tokens_seen": 870651904 + }, + { + "epoch": 2.06, + "learning_rate": 0.000371815446339017, + "loss": 2.9965, + "theoretical_loss": 3.698213532372617, + "tokens_seen": 870717440 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037180541624874625, + "loss": 3.063, + "theoretical_loss": 3.6981863617948263, + "tokens_seen": 870782976 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037179538615847544, + "loss": 2.9875, + "theoretical_loss": 3.6981591938343623, + "tokens_seen": 870848512 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003717853560682046, + "loss": 2.8986, + "theoretical_loss": 3.6981320284907757, + "tokens_seen": 870914048 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003717753259779338, + "loss": 2.9798, + "theoretical_loss": 3.698104865763618, + "tokens_seen": 870979584 + }, + { + "epoch": 2.06, + "learning_rate": 0.000371765295887663, + "loss": 3.0214, + "theoretical_loss": 3.69807770565244, + "tokens_seen": 871045120 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037175526579739216, + "loss": 3.1173, + "theoretical_loss": 3.698050548156793, + "tokens_seen": 871110656 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003717452357071214, + "loss": 3.0723, + "theoretical_loss": 3.698023393276228, + "tokens_seen": 871176192 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003717352056168505, + "loss": 3.0814, + "theoretical_loss": 3.6979962410102973, + "tokens_seen": 871241728 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037172517552657976, + "loss": 2.9173, + "theoretical_loss": 3.6979690913585523, + "tokens_seen": 871307264 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003717151454363089, + "loss": 3.0547, + "theoretical_loss": 3.6979419443205446, + "tokens_seen": 871372800 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003717051153460381, + "loss": 2.894, + "theoretical_loss": 3.697914799895825, + "tokens_seen": 871438336 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003716950852557673, + "loss": 3.0044, + "theoretical_loss": 3.6978876580839475, + "tokens_seen": 871503872 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003716850551654965, + "loss": 2.7258, + "theoretical_loss": 3.697860518884463, + "tokens_seen": 871569408 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1405377, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.062124013900757, + "objective/train/theoretical_loss": 3.6978469502642275, + "objective/train/tokens_used": 892062176, + "theoretical_loss": 3.6978469502642275, + "tokens_seen": 871602176 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037167502507522566, + "loss": 2.9914, + "theoretical_loss": 3.697833382296923, + "tokens_seen": 871634944 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003716649949849549, + "loss": 2.8784, + "theoretical_loss": 3.6978062483208807, + "tokens_seen": 871700480 + }, + { + "epoch": 2.06, + "learning_rate": 0.000371654964894684, + "loss": 2.6977, + "theoretical_loss": 3.6977791169558882, + "tokens_seen": 871766016 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037164493480441326, + "loss": 2.9683, + "theoretical_loss": 3.6977519882014978, + "tokens_seen": 871831552 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003716349047141424, + "loss": 2.8157, + "theoretical_loss": 3.697724862057263, + "tokens_seen": 871897088 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003716248746238716, + "loss": 2.8107, + "theoretical_loss": 3.6976977385227348, + "tokens_seen": 871962624 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037161484453360086, + "loss": 2.9125, + "theoretical_loss": 3.6976706175974674, + "tokens_seen": 872028160 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037160481444333, + "loss": 2.9959, + "theoretical_loss": 3.6976434992810137, + "tokens_seen": 872093696 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003715947843530592, + "loss": 3.0811, + "theoretical_loss": 3.697616383572926, + "tokens_seen": 872159232 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037158475426278835, + "loss": 3.1543, + "theoretical_loss": 3.6975892704727578, + "tokens_seen": 872224768 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003715747241725176, + "loss": 2.9336, + "theoretical_loss": 3.6975621599800625, + "tokens_seen": 872290304 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037156469408224676, + "loss": 2.8247, + "theoretical_loss": 3.697535052094393, + "tokens_seen": 872355840 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037155466399197595, + "loss": 2.8407, + "theoretical_loss": 3.6975079468153034, + "tokens_seen": 872421376 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003715446339017051, + "loss": 2.8753, + "theoretical_loss": 3.697480844142347, + "tokens_seen": 872486912 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037153460381143436, + "loss": 2.9698, + "theoretical_loss": 3.6974537440750774, + "tokens_seen": 872552448 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003715245737211635, + "loss": 2.939, + "theoretical_loss": 3.6974266466130485, + "tokens_seen": 872617984 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003715145436308927, + "loss": 2.8742, + "theoretical_loss": 3.6973995517558143, + "tokens_seen": 872683520 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037150451354062185, + "loss": 2.8697, + "theoretical_loss": 3.6973724595029287, + "tokens_seen": 872749056 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003714944834503511, + "loss": 2.9549, + "theoretical_loss": 3.697345369853946, + "tokens_seen": 872814592 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037148445336008027, + "loss": 2.8523, + "theoretical_loss": 3.6973182828084203, + "tokens_seen": 872880128 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037147442326980945, + "loss": 2.8897, + "theoretical_loss": 3.697291198365906, + "tokens_seen": 872945664 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037146439317953863, + "loss": 2.7578, + "theoretical_loss": 3.6972641165259574, + "tokens_seen": 873011200 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003714543630892678, + "loss": 2.9176, + "theoretical_loss": 3.6972370372881285, + "tokens_seen": 873076736 + }, + { + "epoch": 2.06, + "learning_rate": 0.000371444332998997, + "loss": 2.8898, + "theoretical_loss": 3.6972099606519757, + "tokens_seen": 873142272 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037143430290872623, + "loss": 2.7923, + "theoretical_loss": 3.6971828866170524, + "tokens_seen": 873207808 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1408001, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.722212553024292, + "objective/train/theoretical_loss": 3.697169350574913, + "objective/train/tokens_used": 893700576, + "theoretical_loss": 3.697169350574913, + "tokens_seen": 873240576 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037142427281845535, + "loss": 2.8109, + "theoretical_loss": 3.6971558151829136, + "tokens_seen": 873273344 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003714142427281846, + "loss": 2.9334, + "theoretical_loss": 3.697128746349115, + "tokens_seen": 873338880 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003714042126379137, + "loss": 3.0031, + "theoretical_loss": 3.6971016801152112, + "tokens_seen": 873404416 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037139418254764295, + "loss": 2.9421, + "theoretical_loss": 3.6970746164807573, + "tokens_seen": 873469952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037138415245737213, + "loss": 2.9584, + "theoretical_loss": 3.6970475554453093, + "tokens_seen": 873535488 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003713741223671013, + "loss": 2.8666, + "theoretical_loss": 3.697020497008422, + "tokens_seen": 873601024 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003713640922768305, + "loss": 2.8941, + "theoretical_loss": 3.6969934411696515, + "tokens_seen": 873666560 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037135406218655973, + "loss": 2.8707, + "theoretical_loss": 3.696966387928553, + "tokens_seen": 873732096 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037134403209628886, + "loss": 2.9669, + "theoretical_loss": 3.6969393372846824, + "tokens_seen": 873797632 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003713340020060181, + "loss": 2.9456, + "theoretical_loss": 3.6969122892375954, + "tokens_seen": 873863168 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003713239719157472, + "loss": 3.0226, + "theoretical_loss": 3.6968852437868485, + "tokens_seen": 873928704 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037131394182547645, + "loss": 3.0603, + "theoretical_loss": 3.696858200931997, + "tokens_seen": 873994240 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037130391173520564, + "loss": 2.7451, + "theoretical_loss": 3.6968311606725983, + "tokens_seen": 874059776 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003712938816449348, + "loss": 2.8508, + "theoretical_loss": 3.696804123008208, + "tokens_seen": 874125312 + }, + { + "epoch": 2.06, + "learning_rate": 0.000371283851554664, + "loss": 2.8578, + "theoretical_loss": 3.696777087938382, + "tokens_seen": 874190848 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003712738214643932, + "loss": 2.7934, + "theoretical_loss": 3.696750055462678, + "tokens_seen": 874256384 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037126379137412236, + "loss": 3.0452, + "theoretical_loss": 3.6967230255806522, + "tokens_seen": 874321920 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003712537612838516, + "loss": 2.999, + "theoretical_loss": 3.696695998291861, + "tokens_seen": 874387456 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003712437311935807, + "loss": 3.0487, + "theoretical_loss": 3.6966689735958616, + "tokens_seen": 874452992 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037123370110330996, + "loss": 2.9554, + "theoretical_loss": 3.696641951492211, + "tokens_seen": 874518528 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003712236710130391, + "loss": 2.9172, + "theoretical_loss": 3.696614931980466, + "tokens_seen": 874584064 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003712136409227683, + "loss": 2.9855, + "theoretical_loss": 3.696587915060184, + "tokens_seen": 874649600 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003712036108324975, + "loss": 3.0627, + "theoretical_loss": 3.6965609007309226, + "tokens_seen": 874715136 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003711935807422267, + "loss": 2.9102, + "theoretical_loss": 3.6965338889922386, + "tokens_seen": 874780672 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037118355065195586, + "loss": 2.9158, + "theoretical_loss": 3.6965068798436898, + "tokens_seen": 874846208 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1410704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.040503978729248, + "objective/train/theoretical_loss": 3.6964933762405776, + "objective/train/tokens_used": 895338976, + "theoretical_loss": 3.6964933762405776, + "tokens_seen": 874878976 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003711735205616851, + "loss": 2.9777, + "theoretical_loss": 3.696479873284834, + "tokens_seen": 874911744 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003711634904714142, + "loss": 2.9127, + "theoretical_loss": 3.6964528693152285, + "tokens_seen": 874977280 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037115346038114346, + "loss": 3.1721, + "theoretical_loss": 3.6964258679344315, + "tokens_seen": 875042816 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003711434302908726, + "loss": 2.884, + "theoretical_loss": 3.6963988691420013, + "tokens_seen": 875108352 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003711334002006018, + "loss": 2.9607, + "theoretical_loss": 3.6963718729374957, + "tokens_seen": 875173888 + }, + { + "epoch": 2.06, + "learning_rate": 0.000371123370110331, + "loss": 2.8081, + "theoretical_loss": 3.6963448793204723, + "tokens_seen": 875239424 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003711133400200602, + "loss": 3.076, + "theoretical_loss": 3.6963178882904897, + "tokens_seen": 875304960 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037110330992978937, + "loss": 2.8398, + "theoretical_loss": 3.6962908998471065, + "tokens_seen": 875370496 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037109327983951855, + "loss": 2.9377, + "theoretical_loss": 3.6962639139898816, + "tokens_seen": 875436032 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037108324974924773, + "loss": 2.9537, + "theoretical_loss": 3.696236930718373, + "tokens_seen": 875501568 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037107321965897696, + "loss": 2.8792, + "theoretical_loss": 3.6962099500321393, + "tokens_seen": 875567104 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003710631895687061, + "loss": 2.9689, + "theoretical_loss": 3.6961829719307397, + "tokens_seen": 875632640 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003710531594784353, + "loss": 2.9756, + "theoretical_loss": 3.696155996413733, + "tokens_seen": 875698176 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037104312938816445, + "loss": 2.8992, + "theoretical_loss": 3.696129023480678, + "tokens_seen": 875763712 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003710330992978937, + "loss": 3.0619, + "theoretical_loss": 3.696102053131134, + "tokens_seen": 875829248 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037102306920762287, + "loss": 2.7398, + "theoretical_loss": 3.6960750853646607, + "tokens_seen": 875894784 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037101303911735205, + "loss": 3.0301, + "theoretical_loss": 3.696048120180817, + "tokens_seen": 875960320 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037100300902708123, + "loss": 2.9868, + "theoretical_loss": 3.696021157579162, + "tokens_seen": 876025856 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037099297893681047, + "loss": 2.9494, + "theoretical_loss": 3.6959941975592567, + "tokens_seen": 876091392 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003709829488465396, + "loss": 2.937, + "theoretical_loss": 3.69596724012066, + "tokens_seen": 876156928 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037097291875626883, + "loss": 2.8667, + "theoretical_loss": 3.69594028526293, + "tokens_seen": 876222464 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037096288866599796, + "loss": 2.8394, + "theoretical_loss": 3.6959133329856293, + "tokens_seen": 876288000 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003709528585757272, + "loss": 3.0699, + "theoretical_loss": 3.695886383288317, + "tokens_seen": 876353536 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003709428284854564, + "loss": 2.9593, + "theoretical_loss": 3.6958594361705526, + "tokens_seen": 876419072 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037093279839518555, + "loss": 2.9392, + "theoretical_loss": 3.6958324916318963, + "tokens_seen": 876484608 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1413309, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.215353012084961, + "objective/train/theoretical_loss": 3.695819020329597, + "objective/train/tokens_used": 896977376, + "theoretical_loss": 3.695819020329597, + "tokens_seen": 876517376 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037092276830491474, + "loss": 2.7712, + "theoretical_loss": 3.6958055496719098, + "tokens_seen": 876550144 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003709127382146439, + "loss": 3.0212, + "theoretical_loss": 3.6957786102901515, + "tokens_seen": 876615680 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003709027081243731, + "loss": 2.9289, + "theoretical_loss": 3.6957516734861837, + "tokens_seen": 876681216 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037089267803410233, + "loss": 3.0629, + "theoretical_loss": 3.6957247392595667, + "tokens_seen": 876746752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037088264794383146, + "loss": 3.0202, + "theoretical_loss": 3.6956978076098608, + "tokens_seen": 876812288 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003708726178535607, + "loss": 2.8927, + "theoretical_loss": 3.695670878536627, + "tokens_seen": 876877824 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037086258776328993, + "loss": 2.9814, + "theoretical_loss": 3.6956439520394264, + "tokens_seen": 876943360 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037085255767301906, + "loss": 2.8979, + "theoretical_loss": 3.6956170281178196, + "tokens_seen": 877008896 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003708425275827483, + "loss": 2.9931, + "theoretical_loss": 3.695590106771369, + "tokens_seen": 877074432 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003708324974924774, + "loss": 2.7197, + "theoretical_loss": 3.6955631879996345, + "tokens_seen": 877139968 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037082246740220665, + "loss": 3.0311, + "theoretical_loss": 3.6955362718021783, + "tokens_seen": 877205504 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037081243731193584, + "loss": 2.8879, + "theoretical_loss": 3.695509358178562, + "tokens_seen": 877271040 + }, + { + "epoch": 2.06, + "learning_rate": 0.000370802407221665, + "loss": 2.8503, + "theoretical_loss": 3.695482447128347, + "tokens_seen": 877336576 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003707923771313942, + "loss": 3.0062, + "theoretical_loss": 3.695455538651095, + "tokens_seen": 877402112 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003707823470411234, + "loss": 2.9638, + "theoretical_loss": 3.695428632746368, + "tokens_seen": 877467648 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037077231695085256, + "loss": 3.0172, + "theoretical_loss": 3.6954017294137276, + "tokens_seen": 877533184 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003707622868605818, + "loss": 2.9312, + "theoretical_loss": 3.695374828652736, + "tokens_seen": 877598720 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003707522567703109, + "loss": 3.0428, + "theoretical_loss": 3.695347930462956, + "tokens_seen": 877664256 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037074222668004016, + "loss": 2.8952, + "theoretical_loss": 3.695321034843949, + "tokens_seen": 877729792 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003707321965897693, + "loss": 2.984, + "theoretical_loss": 3.695294141795278, + "tokens_seen": 877795328 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003707221664994985, + "loss": 2.9639, + "theoretical_loss": 3.695267251316505, + "tokens_seen": 877860864 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003707121364092277, + "loss": 2.8931, + "theoretical_loss": 3.6952403634071924, + "tokens_seen": 877926400 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003707021063189569, + "loss": 2.948, + "theoretical_loss": 3.695213478066904, + "tokens_seen": 877991936 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037069207622868606, + "loss": 3.0408, + "theoretical_loss": 3.695186595295201, + "tokens_seen": 878057472 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003706820461384153, + "loss": 2.9622, + "theoretical_loss": 3.6951597150916484, + "tokens_seen": 878123008 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1416301, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0618767738342285, + "objective/train/theoretical_loss": 3.695146275952791, + "objective/train/tokens_used": 898615776, + "theoretical_loss": 3.695146275952791, + "tokens_seen": 878155776 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003706720160481444, + "loss": 3.0452, + "theoretical_loss": 3.6951328374558075, + "tokens_seen": 878188544 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037066198595787366, + "loss": 2.7682, + "theoretical_loss": 3.6951059623872418, + "tokens_seen": 878254080 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003706519558676028, + "loss": 2.9614, + "theoretical_loss": 3.695079089885515, + "tokens_seen": 878319616 + }, + { + "epoch": 2.06, + "learning_rate": 0.000370641925777332, + "loss": 2.8131, + "theoretical_loss": 3.69505221995019, + "tokens_seen": 878385152 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003706318956870612, + "loss": 2.9609, + "theoretical_loss": 3.6950253525808305, + "tokens_seen": 878450688 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003706218655967904, + "loss": 2.9054, + "theoretical_loss": 3.694998487777, + "tokens_seen": 878516224 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037061183550651957, + "loss": 2.8068, + "theoretical_loss": 3.6949716255382623, + "tokens_seen": 878581760 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037060180541624875, + "loss": 2.7577, + "theoretical_loss": 3.6949447658641805, + "tokens_seen": 878647296 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037059177532597793, + "loss": 2.8784, + "theoretical_loss": 3.6949179087543196, + "tokens_seen": 878712832 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037058174523570716, + "loss": 2.9443, + "theoretical_loss": 3.6948910542082425, + "tokens_seen": 878778368 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003705717151454363, + "loss": 2.9398, + "theoretical_loss": 3.6948642022255136, + "tokens_seen": 878843904 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003705616850551655, + "loss": 3.0307, + "theoretical_loss": 3.694837352805698, + "tokens_seen": 878909440 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037055165496489465, + "loss": 3.0123, + "theoretical_loss": 3.6948105059483587, + "tokens_seen": 878974976 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003705416248746239, + "loss": 2.8622, + "theoretical_loss": 3.694783661653061, + "tokens_seen": 879040512 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037053159478435307, + "loss": 2.9894, + "theoretical_loss": 3.6947568199193688, + "tokens_seen": 879106048 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037052156469408225, + "loss": 2.9048, + "theoretical_loss": 3.694729980746847, + "tokens_seen": 879171584 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037051153460381143, + "loss": 2.9832, + "theoretical_loss": 3.694703144135061, + "tokens_seen": 879237120 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037050150451354067, + "loss": 2.9561, + "theoretical_loss": 3.6946763100835742, + "tokens_seen": 879302656 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003704914744232698, + "loss": 3.0623, + "theoretical_loss": 3.694649478591952, + "tokens_seen": 879368192 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037048144433299903, + "loss": 3.0416, + "theoretical_loss": 3.6946226496597605, + "tokens_seen": 879433728 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037047141424272816, + "loss": 2.8346, + "theoretical_loss": 3.694595823286564, + "tokens_seen": 879499264 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003704613841524574, + "loss": 2.929, + "theoretical_loss": 3.6945689994719277, + "tokens_seen": 879564800 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003704513540621866, + "loss": 2.853, + "theoretical_loss": 3.6945421782154177, + "tokens_seen": 879630336 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037044132397191575, + "loss": 2.9578, + "theoretical_loss": 3.6945153595165983, + "tokens_seen": 879695872 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037043129388164494, + "loss": 3.0255, + "theoretical_loss": 3.694488543375036, + "tokens_seen": 879761408 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1419099, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8070034980773926, + "objective/train/theoretical_loss": 3.6944751362630903, + "objective/train/tokens_used": 900254176, + "theoretical_loss": 3.6944751362630903, + "tokens_seen": 879794176 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003704212637913741, + "loss": 3.001, + "theoretical_loss": 3.694461729790296, + "tokens_seen": 879826944 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003704112337011033, + "loss": 2.844, + "theoretical_loss": 3.694434918761944, + "tokens_seen": 879892480 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037040120361083253, + "loss": 2.9233, + "theoretical_loss": 3.6944081102895474, + "tokens_seen": 879958016 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037039117352056166, + "loss": 3.0624, + "theoretical_loss": 3.6943813043726696, + "tokens_seen": 880023552 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003703811434302909, + "loss": 2.976, + "theoretical_loss": 3.6943545010108787, + "tokens_seen": 880089088 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003703711133400201, + "loss": 3.023, + "theoretical_loss": 3.6943277002037407, + "tokens_seen": 880154624 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037036108324974926, + "loss": 2.9619, + "theoretical_loss": 3.694300901950821, + "tokens_seen": 880220160 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037035105315947844, + "loss": 2.9241, + "theoretical_loss": 3.6942741062516866, + "tokens_seen": 880285696 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003703410230692076, + "loss": 3.0631, + "theoretical_loss": 3.6942473131059037, + "tokens_seen": 880351232 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003703309929789368, + "loss": 2.9422, + "theoretical_loss": 3.69422052251304, + "tokens_seen": 880416768 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037032096288866604, + "loss": 2.8485, + "theoretical_loss": 3.694193734472661, + "tokens_seen": 880482304 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037031093279839516, + "loss": 3.0103, + "theoretical_loss": 3.6941669489843343, + "tokens_seen": 880547840 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003703009027081244, + "loss": 2.9923, + "theoretical_loss": 3.6941401660476263, + "tokens_seen": 880613376 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003702908726178535, + "loss": 2.9025, + "theoretical_loss": 3.6941133856621047, + "tokens_seen": 880678912 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037028084252758276, + "loss": 2.9468, + "theoretical_loss": 3.6940866078273364, + "tokens_seen": 880744448 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037027081243731194, + "loss": 2.9838, + "theoretical_loss": 3.694059832542888, + "tokens_seen": 880809984 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003702607823470411, + "loss": 3.1091, + "theoretical_loss": 3.694033059808328, + "tokens_seen": 880875520 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003702507522567703, + "loss": 2.9587, + "theoretical_loss": 3.694006289623223, + "tokens_seen": 880941056 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003702407221664995, + "loss": 2.9289, + "theoretical_loss": 3.693979521987141, + "tokens_seen": 881006592 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037023069207622867, + "loss": 2.8518, + "theoretical_loss": 3.69395275689965, + "tokens_seen": 881072128 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003702206619859579, + "loss": 2.9453, + "theoretical_loss": 3.6939259943603173, + "tokens_seen": 881137664 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037021063189568703, + "loss": 3.0948, + "theoretical_loss": 3.693899234368711, + "tokens_seen": 881203200 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037020060180541626, + "loss": 2.8597, + "theoretical_loss": 3.693872476924399, + "tokens_seen": 881268736 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037019057171514544, + "loss": 3.0662, + "theoretical_loss": 3.6938457220269494, + "tokens_seen": 881334272 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003701805416248746, + "loss": 2.9678, + "theoretical_loss": 3.693818969675931, + "tokens_seen": 881399808 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1421904, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.955669403076172, + "objective/train/theoretical_loss": 3.693805594455198, + "objective/train/tokens_used": 901892576, + "theoretical_loss": 3.693805594455198, + "tokens_seen": 881432576 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003701705115346038, + "loss": 2.9477, + "theoretical_loss": 3.693792219870911, + "tokens_seen": 881465344 + }, + { + "epoch": 2.06, + "learning_rate": 0.000370160481444333, + "loss": 2.9464, + "theoretical_loss": 3.693765472611459, + "tokens_seen": 881530880 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037015045135406217, + "loss": 2.9619, + "theoretical_loss": 3.6937387278971423, + "tokens_seen": 881596416 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003701404212637914, + "loss": 2.8495, + "theoretical_loss": 3.693711985727531, + "tokens_seen": 881661952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037013039117352053, + "loss": 2.9577, + "theoretical_loss": 3.693685246102193, + "tokens_seen": 881727488 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037012036108324977, + "loss": 2.8593, + "theoretical_loss": 3.693658509020697, + "tokens_seen": 881793024 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037011033099297895, + "loss": 2.8322, + "theoretical_loss": 3.693631774482612, + "tokens_seen": 881858560 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037010030090270813, + "loss": 2.946, + "theoretical_loss": 3.693605042487508, + "tokens_seen": 881924096 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037009027081243736, + "loss": 2.8971, + "theoretical_loss": 3.693578313034953, + "tokens_seen": 881989632 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003700802407221665, + "loss": 3.0529, + "theoretical_loss": 3.693551586124517, + "tokens_seen": 882055168 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003700702106318957, + "loss": 2.9651, + "theoretical_loss": 3.693524861755769, + "tokens_seen": 882120704 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037006018054162485, + "loss": 3.1103, + "theoretical_loss": 3.6934981399282787, + "tokens_seen": 882186240 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003700501504513541, + "loss": 2.8133, + "theoretical_loss": 3.6934714206416155, + "tokens_seen": 882251776 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037004012036108327, + "loss": 2.9703, + "theoretical_loss": 3.693444703895349, + "tokens_seen": 882317312 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037003009027081245, + "loss": 2.8875, + "theoretical_loss": 3.693417989689049, + "tokens_seen": 882382848 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037002006018054163, + "loss": 2.8752, + "theoretical_loss": 3.693391278022286, + "tokens_seen": 882448384 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037001003009027087, + "loss": 2.9391, + "theoretical_loss": 3.69336456889463, + "tokens_seen": 882513920 + }, + { + "epoch": 2.06, + "learning_rate": 0.00037, + "loss": 2.9792, + "theoretical_loss": 3.6933378623056505, + "tokens_seen": 882579456 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036998996990972923, + "loss": 2.907, + "theoretical_loss": 3.6933111582549176, + "tokens_seen": 882644992 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036997993981945836, + "loss": 2.9886, + "theoretical_loss": 3.693284456742002, + "tokens_seen": 882710528 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003699699097291876, + "loss": 2.8707, + "theoretical_loss": 3.693257757766474, + "tokens_seen": 882776064 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003699598796389168, + "loss": 2.9126, + "theoretical_loss": 3.6932310613279045, + "tokens_seen": 882841600 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036994984954864595, + "loss": 2.9168, + "theoretical_loss": 3.6932043674258637, + "tokens_seen": 882907136 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036993981945837514, + "loss": 3.019, + "theoretical_loss": 3.693177676059922, + "tokens_seen": 882972672 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003699297893681043, + "loss": 3.1493, + "theoretical_loss": 3.693150987229652, + "tokens_seen": 883038208 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1423225, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.996013641357422, + "objective/train/theoretical_loss": 3.693137643765259, + "objective/train/tokens_used": 903530976, + "theoretical_loss": 3.693137643765259, + "tokens_seen": 883070976 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003699197592778335, + "loss": 2.8904, + "theoretical_loss": 3.6931243009346226, + "tokens_seen": 883103744 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036990972918756273, + "loss": 3.0048, + "theoretical_loss": 3.6930976171744057, + "tokens_seen": 883169280 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036989969909729186, + "loss": 2.7666, + "theoretical_loss": 3.6930709359485725, + "tokens_seen": 883234816 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003698896690070211, + "loss": 2.7699, + "theoretical_loss": 3.693044257256694, + "tokens_seen": 883300352 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003698796389167503, + "loss": 3.1069, + "theoretical_loss": 3.693017581098342, + "tokens_seen": 883365888 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036986960882647946, + "loss": 2.9604, + "theoretical_loss": 3.6929909074730873, + "tokens_seen": 883431424 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036985957873620864, + "loss": 2.9147, + "theoretical_loss": 3.692964236380502, + "tokens_seen": 883496960 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003698495486459378, + "loss": 2.9022, + "theoretical_loss": 3.692937567820158, + "tokens_seen": 883562496 + }, + { + "epoch": 2.06, + "learning_rate": 0.000369839518555667, + "loss": 3.04, + "theoretical_loss": 3.6929109017916266, + "tokens_seen": 883628032 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036982948846539624, + "loss": 2.8905, + "theoretical_loss": 3.6928842382944795, + "tokens_seen": 883693568 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036981945837512536, + "loss": 3.0069, + "theoretical_loss": 3.6928575773282897, + "tokens_seen": 883759104 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003698094282848546, + "loss": 2.8383, + "theoretical_loss": 3.6928309188926285, + "tokens_seen": 883824640 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003697993981945837, + "loss": 2.8913, + "theoretical_loss": 3.692804262987068, + "tokens_seen": 883890176 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036978936810431296, + "loss": 2.8032, + "theoretical_loss": 3.6927776096111806, + "tokens_seen": 883955712 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036977933801404214, + "loss": 2.9121, + "theoretical_loss": 3.692750958764539, + "tokens_seen": 884021248 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003697693079237713, + "loss": 2.9724, + "theoretical_loss": 3.6927243104467156, + "tokens_seen": 884086784 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003697592778335005, + "loss": 2.9441, + "theoretical_loss": 3.692697664657283, + "tokens_seen": 884152320 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003697492477432297, + "loss": 2.9012, + "theoretical_loss": 3.692671021395814, + "tokens_seen": 884217856 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036973921765295887, + "loss": 2.9747, + "theoretical_loss": 3.692644380661881, + "tokens_seen": 884283392 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003697291875626881, + "loss": 2.872, + "theoretical_loss": 3.6926177424550573, + "tokens_seen": 884348928 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036971915747241723, + "loss": 2.8875, + "theoretical_loss": 3.692591106774916, + "tokens_seen": 884414464 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036970912738214646, + "loss": 2.7893, + "theoretical_loss": 3.6925644736210304, + "tokens_seen": 884480000 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036969909729187565, + "loss": 3.0573, + "theoretical_loss": 3.692537842992973, + "tokens_seen": 884545536 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003696890672016048, + "loss": 3.0199, + "theoretical_loss": 3.6925112148903176, + "tokens_seen": 884611072 + }, + { + "epoch": 2.06, + "learning_rate": 0.000369679037111334, + "loss": 2.854, + "theoretical_loss": 3.692484589312638, + "tokens_seen": 884676608 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1426111, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4226863384246826, + "objective/train/theoretical_loss": 3.692471277470531, + "objective/train/tokens_used": 905169376, + "theoretical_loss": 3.692471277470531, + "tokens_seen": 884709376 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003696690070210632, + "loss": 2.8211, + "theoretical_loss": 3.6924579662595076, + "tokens_seen": 884742144 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036965897693079237, + "loss": 3.0346, + "theoretical_loss": 3.6924313457304994, + "tokens_seen": 884807680 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003696489468405216, + "loss": 2.9903, + "theoretical_loss": 3.692404727725188, + "tokens_seen": 884873216 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036963891675025073, + "loss": 3.0317, + "theoretical_loss": 3.692378112243146, + "tokens_seen": 884938752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036962888665997997, + "loss": 2.8753, + "theoretical_loss": 3.6923514992839497, + "tokens_seen": 885004288 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003696188565697091, + "loss": 2.8462, + "theoretical_loss": 3.6923248888471707, + "tokens_seen": 885069824 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036960882647943833, + "loss": 2.9009, + "theoretical_loss": 3.692298280932384, + "tokens_seen": 885135360 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003695987963891675, + "loss": 3.0396, + "theoretical_loss": 3.692271675539165, + "tokens_seen": 885200896 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003695887662988967, + "loss": 2.9135, + "theoretical_loss": 3.6922450726670863, + "tokens_seen": 885266432 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036957873620862587, + "loss": 2.9877, + "theoretical_loss": 3.692218472315724, + "tokens_seen": 885331968 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036956870611835505, + "loss": 2.883, + "theoretical_loss": 3.692191874484651, + "tokens_seen": 885397504 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036955867602808424, + "loss": 3.0721, + "theoretical_loss": 3.6921652791734436, + "tokens_seen": 885463040 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036954864593781347, + "loss": 2.9859, + "theoretical_loss": 3.692138686381676, + "tokens_seen": 885528576 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003695386158475426, + "loss": 2.7916, + "theoretical_loss": 3.6921120961089224, + "tokens_seen": 885594112 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036952858575727183, + "loss": 2.8957, + "theoretical_loss": 3.6920855083547583, + "tokens_seen": 885659648 + }, + { + "epoch": 2.06, + "learning_rate": 0.000369518555667001, + "loss": 2.9705, + "theoretical_loss": 3.6920589231187595, + "tokens_seen": 885725184 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003695085255767302, + "loss": 3.0179, + "theoretical_loss": 3.6920323404005, + "tokens_seen": 885790720 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003694984954864594, + "loss": 2.8592, + "theoretical_loss": 3.6920057601995566, + "tokens_seen": 885856256 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036948846539618856, + "loss": 2.9114, + "theoretical_loss": 3.691979182515503, + "tokens_seen": 885921792 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036947843530591774, + "loss": 2.9661, + "theoretical_loss": 3.6919526073479156, + "tokens_seen": 885987328 + }, + { + "epoch": 2.06, + "learning_rate": 0.000369468405215647, + "loss": 2.7145, + "theoretical_loss": 3.6919260346963703, + "tokens_seen": 886052864 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003694583751253761, + "loss": 2.9934, + "theoretical_loss": 3.691899464560443, + "tokens_seen": 886118400 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036944834503510534, + "loss": 2.9879, + "theoretical_loss": 3.691872896939708, + "tokens_seen": 886183936 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036943831494483446, + "loss": 2.8279, + "theoretical_loss": 3.6918463318337422, + "tokens_seen": 886249472 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003694282848545637, + "loss": 2.9059, + "theoretical_loss": 3.691819769242122, + "tokens_seen": 886315008 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1428912, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1128909587860107, + "objective/train/theoretical_loss": 3.6918064888890587, + "objective/train/tokens_used": 906807776, + "theoretical_loss": 3.6918064888890587, + "tokens_seen": 886347776 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003694182547642929, + "loss": 3.0174, + "theoretical_loss": 3.691793209164423, + "tokens_seen": 886380544 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036940822467402206, + "loss": 2.8451, + "theoretical_loss": 3.691766651600222, + "tokens_seen": 886446080 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036939819458375124, + "loss": 3.1236, + "theoretical_loss": 3.6917400965490943, + "tokens_seen": 886511616 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003693881644934805, + "loss": 2.8071, + "theoretical_loss": 3.691713544010618, + "tokens_seen": 886577152 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003693781344032096, + "loss": 3.0266, + "theoretical_loss": 3.6916869939843675, + "tokens_seen": 886642688 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036936810431293884, + "loss": 2.7336, + "theoretical_loss": 3.691660446469921, + "tokens_seen": 886708224 + }, + { + "epoch": 2.06, + "learning_rate": 0.000369358074222668, + "loss": 3.0707, + "theoretical_loss": 3.6916339014668553, + "tokens_seen": 886773760 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003693480441323972, + "loss": 2.9802, + "theoretical_loss": 3.6916073589747462, + "tokens_seen": 886839296 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036933801404212644, + "loss": 2.9887, + "theoretical_loss": 3.6915808189931716, + "tokens_seen": 886904832 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036932798395185556, + "loss": 2.9595, + "theoretical_loss": 3.691554281521708, + "tokens_seen": 886970368 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003693179538615848, + "loss": 2.9595, + "theoretical_loss": 3.6915277465599328, + "tokens_seen": 887035904 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003693079237713139, + "loss": 3.0291, + "theoretical_loss": 3.691501214107423, + "tokens_seen": 887101440 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036929789368104316, + "loss": 2.9052, + "theoretical_loss": 3.6914746841637562, + "tokens_seen": 887166976 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036928786359077234, + "loss": 2.8567, + "theoretical_loss": 3.6914481567285105, + "tokens_seen": 887232512 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003692778335005015, + "loss": 2.9908, + "theoretical_loss": 3.691421631801262, + "tokens_seen": 887298048 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003692678034102307, + "loss": 3.0306, + "theoretical_loss": 3.6913951093815895, + "tokens_seen": 887363584 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003692577733199599, + "loss": 2.8834, + "theoretical_loss": 3.69136858946907, + "tokens_seen": 887429120 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036924774322968907, + "loss": 2.9722, + "theoretical_loss": 3.691342072063282, + "tokens_seen": 887494656 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003692377131394183, + "loss": 2.889, + "theoretical_loss": 3.691315557163804, + "tokens_seen": 887560192 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036922768304914743, + "loss": 2.9122, + "theoretical_loss": 3.6912890447702127, + "tokens_seen": 887625728 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036921765295887666, + "loss": 2.8618, + "theoretical_loss": 3.691262534882087, + "tokens_seen": 887691264 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036920762286860585, + "loss": 2.7728, + "theoretical_loss": 3.6912360274990057, + "tokens_seen": 887756800 + }, + { + "epoch": 2.06, + "learning_rate": 0.000369197592778335, + "loss": 2.8042, + "theoretical_loss": 3.6912095226205457, + "tokens_seen": 887822336 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003691875626880642, + "loss": 2.91, + "theoretical_loss": 3.691183020246287, + "tokens_seen": 887887872 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003691775325977934, + "loss": 3.0192, + "theoretical_loss": 3.6911565203758077, + "tokens_seen": 887953408 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1431723, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8450140953063965, + "objective/train/theoretical_loss": 3.691143271379353, + "objective/train/tokens_used": 908446176, + "theoretical_loss": 3.691143271379353, + "tokens_seen": 887986176 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036916750250752257, + "loss": 2.9832, + "theoretical_loss": 3.691130023008686, + "tokens_seen": 888018944 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003691574724172518, + "loss": 2.842, + "theoretical_loss": 3.691103528144501, + "tokens_seen": 888084480 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036914744232698093, + "loss": 2.9783, + "theoretical_loss": 3.6910770357828318, + "tokens_seen": 888150016 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036913741223671017, + "loss": 2.9666, + "theoretical_loss": 3.6910505459232574, + "tokens_seen": 888215552 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003691273821464393, + "loss": 2.9205, + "theoretical_loss": 3.6910240585653566, + "tokens_seen": 888281088 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036911735205616853, + "loss": 2.8728, + "theoretical_loss": 3.6909975737087093, + "tokens_seen": 888346624 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003691073219658977, + "loss": 2.9806, + "theoretical_loss": 3.690971091352894, + "tokens_seen": 888412160 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003690972918756269, + "loss": 2.7468, + "theoretical_loss": 3.69094461149749, + "tokens_seen": 888477696 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003690872617853561, + "loss": 2.8739, + "theoretical_loss": 3.690918134142078, + "tokens_seen": 888543232 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036907723169508525, + "loss": 3.0151, + "theoretical_loss": 3.690891659286236, + "tokens_seen": 888608768 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036906720160481444, + "loss": 2.978, + "theoretical_loss": 3.690865186929545, + "tokens_seen": 888674304 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036905717151454367, + "loss": 2.9245, + "theoretical_loss": 3.6908387170715837, + "tokens_seen": 888739840 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003690471414242728, + "loss": 2.9091, + "theoretical_loss": 3.690812249711933, + "tokens_seen": 888805376 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036903711133400203, + "loss": 2.9714, + "theoretical_loss": 3.690785784850173, + "tokens_seen": 888870912 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003690270812437312, + "loss": 2.8345, + "theoretical_loss": 3.690759322485883, + "tokens_seen": 888936448 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003690170511534604, + "loss": 2.8485, + "theoretical_loss": 3.6907328626186438, + "tokens_seen": 889001984 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003690070210631896, + "loss": 2.7914, + "theoretical_loss": 3.6907064052480347, + "tokens_seen": 889067520 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036899699097291876, + "loss": 2.8907, + "theoretical_loss": 3.690679950373638, + "tokens_seen": 889133056 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036898696088264794, + "loss": 2.8874, + "theoretical_loss": 3.690653497995032, + "tokens_seen": 889198592 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003689769307923772, + "loss": 2.8788, + "theoretical_loss": 3.690627048111799, + "tokens_seen": 889264128 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003689669007021063, + "loss": 3.011, + "theoretical_loss": 3.6906006007235197, + "tokens_seen": 889329664 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036895687061183554, + "loss": 2.9218, + "theoretical_loss": 3.6905741558297738, + "tokens_seen": 889395200 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036894684052156466, + "loss": 3.028, + "theoretical_loss": 3.690547713430143, + "tokens_seen": 889460736 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003689368104312939, + "loss": 2.8785, + "theoretical_loss": 3.6905212735242086, + "tokens_seen": 889526272 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003689267803410231, + "loss": 2.9178, + "theoretical_loss": 3.690494836111551, + "tokens_seen": 889591808 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1434324, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.97959041595459, + "objective/train/theoretical_loss": 3.69048161834007, + "objective/train/tokens_used": 910084576, + "theoretical_loss": 3.69048161834007, + "tokens_seen": 889624576 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036891675025075226, + "loss": 2.9487, + "theoretical_loss": 3.690468401191751, + "tokens_seen": 889657344 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036890672016048144, + "loss": 2.9511, + "theoretical_loss": 3.6904419687643917, + "tokens_seen": 889722880 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003688966900702107, + "loss": 2.9729, + "theoretical_loss": 3.690415538829053, + "tokens_seen": 889788416 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003688866599799398, + "loss": 3.0456, + "theoretical_loss": 3.6903891113853176, + "tokens_seen": 889853952 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036887662988966904, + "loss": 2.7608, + "theoretical_loss": 3.6903626864327657, + "tokens_seen": 889919488 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036886659979939817, + "loss": 2.9971, + "theoretical_loss": 3.69033626397098, + "tokens_seen": 889985024 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003688565697091274, + "loss": 2.9115, + "theoretical_loss": 3.6903098439995423, + "tokens_seen": 890050560 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003688465396188566, + "loss": 2.8703, + "theoretical_loss": 3.690283426518034, + "tokens_seen": 890116096 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036883650952858576, + "loss": 2.9461, + "theoretical_loss": 3.690257011526038, + "tokens_seen": 890181632 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036882647943831494, + "loss": 2.9212, + "theoretical_loss": 3.6902305990231357, + "tokens_seen": 890247168 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003688164493480441, + "loss": 2.9688, + "theoretical_loss": 3.69020418900891, + "tokens_seen": 890312704 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003688064192577733, + "loss": 2.95, + "theoretical_loss": 3.6901777814829426, + "tokens_seen": 890378240 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036879638916750254, + "loss": 2.8861, + "theoretical_loss": 3.690151376444816, + "tokens_seen": 890443776 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036878635907723167, + "loss": 2.9342, + "theoretical_loss": 3.690124973894113, + "tokens_seen": 890509312 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003687763289869609, + "loss": 3.0756, + "theoretical_loss": 3.690098573830417, + "tokens_seen": 890574848 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036876629889669003, + "loss": 2.9805, + "theoretical_loss": 3.6900721762533086, + "tokens_seen": 890640384 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036875626880641927, + "loss": 2.8495, + "theoretical_loss": 3.6900457811623726, + "tokens_seen": 890705920 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036874623871614845, + "loss": 2.845, + "theoretical_loss": 3.6900193885571917, + "tokens_seen": 890771456 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036873620862587763, + "loss": 2.9257, + "theoretical_loss": 3.689992998437348, + "tokens_seen": 890836992 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003687261785356068, + "loss": 2.8406, + "theoretical_loss": 3.6899666108024256, + "tokens_seen": 890902528 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036871614844533605, + "loss": 3.0054, + "theoretical_loss": 3.6899402256520073, + "tokens_seen": 890968064 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036870611835506517, + "loss": 2.9952, + "theoretical_loss": 3.6899138429856766, + "tokens_seen": 891033600 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003686960882647944, + "loss": 3.046, + "theoretical_loss": 3.6898874628030165, + "tokens_seen": 891099136 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036868605817452353, + "loss": 2.9871, + "theoretical_loss": 3.689861085103611, + "tokens_seen": 891164672 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036867602808425277, + "loss": 2.896, + "theoretical_loss": 3.6898347098870437, + "tokens_seen": 891230208 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1437051, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0074684619903564, + "objective/train/theoretical_loss": 3.6898215232096945, + "objective/train/tokens_used": 911722976, + "theoretical_loss": 3.6898215232096945, + "tokens_seen": 891262976 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036866599799398195, + "loss": 3.034, + "theoretical_loss": 3.6898083371528987, + "tokens_seen": 891295744 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036865596790371113, + "loss": 2.9056, + "theoretical_loss": 3.6897819669007594, + "tokens_seen": 891361280 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003686459378134403, + "loss": 2.9934, + "theoretical_loss": 3.6897555991302093, + "tokens_seen": 891426816 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003686359077231695, + "loss": 2.8257, + "theoretical_loss": 3.6897292338408336, + "tokens_seen": 891492352 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003686258776328987, + "loss": 2.8424, + "theoretical_loss": 3.6897028710322157, + "tokens_seen": 891557888 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003686158475426279, + "loss": 2.9012, + "theoretical_loss": 3.6896765107039395, + "tokens_seen": 891623424 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003686058174523571, + "loss": 2.9629, + "theoretical_loss": 3.6896501528555907, + "tokens_seen": 891688960 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003685957873620863, + "loss": 3.0631, + "theoretical_loss": 3.6896237974867523, + "tokens_seen": 891754496 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036858575727181545, + "loss": 2.784, + "theoretical_loss": 3.689597444597009, + "tokens_seen": 891820032 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036857572718154464, + "loss": 2.982, + "theoretical_loss": 3.6895710941859465, + "tokens_seen": 891885568 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036856569709127387, + "loss": 2.9465, + "theoretical_loss": 3.6895447462531488, + "tokens_seen": 891951104 + }, + { + "epoch": 2.06, + "learning_rate": 0.000368555667001003, + "loss": 2.9277, + "theoretical_loss": 3.6895184007982014, + "tokens_seen": 892016640 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036854563691073223, + "loss": 2.9018, + "theoretical_loss": 3.6894920578206882, + "tokens_seen": 892082176 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003685356068204614, + "loss": 2.9584, + "theoretical_loss": 3.689465717320195, + "tokens_seen": 892147712 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003685255767301906, + "loss": 3.0248, + "theoretical_loss": 3.6894393792963065, + "tokens_seen": 892213248 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003685155466399198, + "loss": 3.0666, + "theoretical_loss": 3.6894130437486083, + "tokens_seen": 892278784 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036850551654964896, + "loss": 2.9353, + "theoretical_loss": 3.689386710676686, + "tokens_seen": 892344320 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036849548645937814, + "loss": 2.9362, + "theoretical_loss": 3.689360380080125, + "tokens_seen": 892409856 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003684854563691074, + "loss": 2.8686, + "theoretical_loss": 3.68933405195851, + "tokens_seen": 892475392 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003684754262788365, + "loss": 3.0815, + "theoretical_loss": 3.6893077263114273, + "tokens_seen": 892540928 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036846539618856574, + "loss": 2.9336, + "theoretical_loss": 3.6892814031384624, + "tokens_seen": 892606464 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036845536609829486, + "loss": 2.8535, + "theoretical_loss": 3.689255082439202, + "tokens_seen": 892672000 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003684453360080241, + "loss": 2.9302, + "theoretical_loss": 3.6892287642132313, + "tokens_seen": 892737536 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003684353059177533, + "loss": 2.9513, + "theoretical_loss": 3.6892024484601356, + "tokens_seen": 892803072 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036842527582748246, + "loss": 2.8754, + "theoretical_loss": 3.689176135179503, + "tokens_seen": 892868608 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1438457, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0204591751098633, + "objective/train/theoretical_loss": 3.6891629794662304, + "objective/train/tokens_used": 913361376, + "theoretical_loss": 3.6891629794662304, + "tokens_seen": 892901376 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036841524573721164, + "loss": 2.9751, + "theoretical_loss": 3.6891498243709178, + "tokens_seen": 892934144 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003684052156469409, + "loss": 2.8257, + "theoretical_loss": 3.689123516033968, + "tokens_seen": 892999680 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036839518555667, + "loss": 3.0095, + "theoretical_loss": 3.6890972101682387, + "tokens_seen": 893065216 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036838515546639924, + "loss": 2.8473, + "theoretical_loss": 3.689070906773317, + "tokens_seen": 893130752 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036837512537612837, + "loss": 2.924, + "theoretical_loss": 3.6890446058487893, + "tokens_seen": 893196288 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003683650952858576, + "loss": 3.0471, + "theoretical_loss": 3.6890183073942433, + "tokens_seen": 893261824 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003683550651955868, + "loss": 2.9204, + "theoretical_loss": 3.6889920114092645, + "tokens_seen": 893327360 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036834503510531596, + "loss": 2.8766, + "theoretical_loss": 3.6889657178934407, + "tokens_seen": 893392896 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036833500501504514, + "loss": 2.8859, + "theoretical_loss": 3.6889394268463587, + "tokens_seen": 893458432 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003683249749247743, + "loss": 3.0093, + "theoretical_loss": 3.6889131382676057, + "tokens_seen": 893523968 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003683149448345035, + "loss": 3.0753, + "theoretical_loss": 3.6888868521567693, + "tokens_seen": 893589504 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036830491474423274, + "loss": 2.9202, + "theoretical_loss": 3.6888605685134364, + "tokens_seen": 893655040 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036829488465396187, + "loss": 2.9661, + "theoretical_loss": 3.688834287337194, + "tokens_seen": 893720576 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003682848545636911, + "loss": 2.8499, + "theoretical_loss": 3.688808008627631, + "tokens_seen": 893786112 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036827482447342023, + "loss": 2.91, + "theoretical_loss": 3.6887817323843333, + "tokens_seen": 893851648 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036826479438314947, + "loss": 3.0142, + "theoretical_loss": 3.6887554586068902, + "tokens_seen": 893917184 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036825476429287865, + "loss": 2.8641, + "theoretical_loss": 3.6887291872948884, + "tokens_seen": 893982720 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036824473420260783, + "loss": 3.0247, + "theoretical_loss": 3.688702918447917, + "tokens_seen": 894048256 + }, + { + "epoch": 2.06, + "learning_rate": 0.000368234704112337, + "loss": 2.9232, + "theoretical_loss": 3.688676652065563, + "tokens_seen": 894113792 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036822467402206625, + "loss": 2.8796, + "theoretical_loss": 3.688650388147415, + "tokens_seen": 894179328 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036821464393179537, + "loss": 2.8787, + "theoretical_loss": 3.6886241266930613, + "tokens_seen": 894244864 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003682046138415246, + "loss": 3.0353, + "theoretical_loss": 3.6885978677020894, + "tokens_seen": 894310400 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036819458375125373, + "loss": 2.7989, + "theoretical_loss": 3.688571611174089, + "tokens_seen": 894375936 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036818455366098297, + "loss": 3.0285, + "theoretical_loss": 3.688545357108648, + "tokens_seen": 894441472 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036817452357071215, + "loss": 2.8879, + "theoretical_loss": 3.688519105505355, + "tokens_seen": 894507008 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1441363, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6616625785827637, + "objective/train/theoretical_loss": 3.6885059806268856, + "objective/train/tokens_used": 914999776, + "theoretical_loss": 3.6885059806268856, + "tokens_seen": 894539776 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036816449348044133, + "loss": 2.897, + "theoretical_loss": 3.688492856363799, + "tokens_seen": 894572544 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003681544633901705, + "loss": 3.0042, + "theoretical_loss": 3.6884666096835685, + "tokens_seen": 894638080 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003681444332998997, + "loss": 2.8301, + "theoretical_loss": 3.6884403654642526, + "tokens_seen": 894703616 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003681344032096289, + "loss": 2.9626, + "theoretical_loss": 3.6884141237054404, + "tokens_seen": 894769152 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003681243731193581, + "loss": 2.9735, + "theoretical_loss": 3.688387884406721, + "tokens_seen": 894834688 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036811434302908724, + "loss": 2.8775, + "theoretical_loss": 3.6883616475676835, + "tokens_seen": 894900224 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003681043129388165, + "loss": 3.0536, + "theoretical_loss": 3.688335413187917, + "tokens_seen": 894965760 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003680942828485456, + "loss": 2.9854, + "theoretical_loss": 3.6883091812670115, + "tokens_seen": 895031296 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036808425275827484, + "loss": 3.0162, + "theoretical_loss": 3.6882829518045566, + "tokens_seen": 895096832 + }, + { + "epoch": 2.06, + "learning_rate": 0.000368074222668004, + "loss": 3.0048, + "theoretical_loss": 3.6882567248001408, + "tokens_seen": 895162368 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003680641925777332, + "loss": 2.7608, + "theoretical_loss": 3.688230500253355, + "tokens_seen": 895227904 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003680541624874624, + "loss": 3.0016, + "theoretical_loss": 3.6882042781637887, + "tokens_seen": 895293440 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003680441323971916, + "loss": 3.0512, + "theoretical_loss": 3.6881780585310313, + "tokens_seen": 895358976 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036803410230692074, + "loss": 3.0124, + "theoretical_loss": 3.6881518413546734, + "tokens_seen": 895424512 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036802407221665, + "loss": 2.9888, + "theoretical_loss": 3.6881256266343057, + "tokens_seen": 895490048 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003680140421263791, + "loss": 2.9215, + "theoretical_loss": 3.6880994143695167, + "tokens_seen": 895555584 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036800401203610834, + "loss": 2.8077, + "theoretical_loss": 3.688073204559898, + "tokens_seen": 895621120 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003679939819458375, + "loss": 2.9941, + "theoretical_loss": 3.6880469972050394, + "tokens_seen": 895686656 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003679839518555667, + "loss": 2.9698, + "theoretical_loss": 3.688020792304532, + "tokens_seen": 895752192 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003679739217652959, + "loss": 2.8921, + "theoretical_loss": 3.687994589857966, + "tokens_seen": 895817728 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036796389167502506, + "loss": 2.9957, + "theoretical_loss": 3.687968389864932, + "tokens_seen": 895883264 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036795386158475424, + "loss": 2.9378, + "theoretical_loss": 3.6879421923250213, + "tokens_seen": 895948800 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003679438314944835, + "loss": 2.8858, + "theoretical_loss": 3.687915997237824, + "tokens_seen": 896014336 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003679338014042126, + "loss": 2.843, + "theoretical_loss": 3.687889804602932, + "tokens_seen": 896079872 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036792377131394184, + "loss": 2.9208, + "theoretical_loss": 3.6878636144199355, + "tokens_seen": 896145408 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1444166, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5817642211914062, + "objective/train/theoretical_loss": 3.6878505202477707, + "objective/train/tokens_used": 916638176, + "theoretical_loss": 3.6878505202477707, + "tokens_seen": 896178176 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036791374122367097, + "loss": 2.8896, + "theoretical_loss": 3.687837426688427, + "tokens_seen": 896210944 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003679037111334002, + "loss": 2.9395, + "theoretical_loss": 3.6878112414079958, + "tokens_seen": 896276480 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003678936810431294, + "loss": 2.9768, + "theoretical_loss": 3.6877850585782355, + "tokens_seen": 896342016 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036788365095285857, + "loss": 2.9141, + "theoretical_loss": 3.687758878198736, + "tokens_seen": 896407552 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036787362086258775, + "loss": 2.879, + "theoretical_loss": 3.6877327002690894, + "tokens_seen": 896473088 + }, + { + "epoch": 2.06, + "learning_rate": 0.000367863590772317, + "loss": 3.0004, + "theoretical_loss": 3.6877065247888874, + "tokens_seen": 896538624 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036785356068204616, + "loss": 2.9843, + "theoretical_loss": 3.687680351757722, + "tokens_seen": 896604160 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036784353059177535, + "loss": 2.981, + "theoretical_loss": 3.687654181175185, + "tokens_seen": 896669696 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003678335005015045, + "loss": 2.8728, + "theoretical_loss": 3.687628013040868, + "tokens_seen": 896735232 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003678234704112337, + "loss": 2.9269, + "theoretical_loss": 3.6876018473543635, + "tokens_seen": 896800768 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036781344032096294, + "loss": 2.8963, + "theoretical_loss": 3.687575684115263, + "tokens_seen": 896866304 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036780341023069207, + "loss": 3.0322, + "theoretical_loss": 3.6875495233231597, + "tokens_seen": 896931840 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003677933801404213, + "loss": 2.8531, + "theoretical_loss": 3.687523364977645, + "tokens_seen": 896997376 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036778335005015043, + "loss": 2.9334, + "theoretical_loss": 3.6874972090783125, + "tokens_seen": 897062912 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036777331995987967, + "loss": 3.0906, + "theoretical_loss": 3.687471055624754, + "tokens_seen": 897128448 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036776328986960885, + "loss": 2.8506, + "theoretical_loss": 3.6874449046165623, + "tokens_seen": 897193984 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036775325977933803, + "loss": 2.7673, + "theoretical_loss": 3.6874187560533302, + "tokens_seen": 897259520 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003677432296890672, + "loss": 3.0224, + "theoretical_loss": 3.6873926099346503, + "tokens_seen": 897325056 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036773319959879645, + "loss": 2.837, + "theoretical_loss": 3.6873664662601158, + "tokens_seen": 897390592 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036772316950852557, + "loss": 2.8448, + "theoretical_loss": 3.6873403250293197, + "tokens_seen": 897456128 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003677131394182548, + "loss": 2.8551, + "theoretical_loss": 3.6873141862418555, + "tokens_seen": 897521664 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036770310932798393, + "loss": 2.8662, + "theoretical_loss": 3.687288049897316, + "tokens_seen": 897587200 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036769307923771317, + "loss": 2.9606, + "theoretical_loss": 3.6872619159952946, + "tokens_seen": 897652736 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036768304914744235, + "loss": 2.9799, + "theoretical_loss": 3.687235784535385, + "tokens_seen": 897718272 + }, + { + "epoch": 2.06, + "learning_rate": 0.00036767301905717153, + "loss": 2.9949, + "theoretical_loss": 3.6872096555171803, + "tokens_seen": 897783808 + }, + { + "epoch": 2.06, + "objective/train/docs_used": 1447083, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0007472038269043, + "objective/train/theoretical_loss": 3.6871965919235903, + "objective/train/tokens_used": 918276576, + "theoretical_loss": 3.6871965919235903, + "tokens_seen": 897816576 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003676629889669007, + "loss": 2.9133, + "theoretical_loss": 3.6871835289402743, + "tokens_seen": 897849344 + }, + { + "epoch": 2.06, + "learning_rate": 0.0003676529588766299, + "loss": 2.92, + "theoretical_loss": 3.6871574048042612, + "tokens_seen": 897914880 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003676429287863591, + "loss": 3.0691, + "theoretical_loss": 3.687131283108734, + "tokens_seen": 897980416 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003676328986960883, + "loss": 2.9901, + "theoretical_loss": 3.6871051638532872, + "tokens_seen": 898045952 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036762286860581744, + "loss": 2.8652, + "theoretical_loss": 3.687079047037515, + "tokens_seen": 898111488 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003676128385155467, + "loss": 3.073, + "theoretical_loss": 3.687052932661011, + "tokens_seen": 898177024 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003676028084252758, + "loss": 2.8922, + "theoretical_loss": 3.6870268207233696, + "tokens_seen": 898242560 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036759277833500504, + "loss": 2.8879, + "theoretical_loss": 3.687000711224185, + "tokens_seen": 898308096 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003675827482447342, + "loss": 3.0426, + "theoretical_loss": 3.6869746041630522, + "tokens_seen": 898373632 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003675727181544634, + "loss": 2.8763, + "theoretical_loss": 3.6869484995395654, + "tokens_seen": 898439168 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003675626880641926, + "loss": 2.806, + "theoretical_loss": 3.6869223973533183, + "tokens_seen": 898504704 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003675526579739218, + "loss": 2.9815, + "theoretical_loss": 3.686896297603907, + "tokens_seen": 898570240 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036754262788365094, + "loss": 2.9997, + "theoretical_loss": 3.686870200290925, + "tokens_seen": 898635776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003675325977933802, + "loss": 2.9543, + "theoretical_loss": 3.686844105413969, + "tokens_seen": 898701312 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003675225677031093, + "loss": 3.0299, + "theoretical_loss": 3.686818012972632, + "tokens_seen": 898766848 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036751253761283854, + "loss": 3.0409, + "theoretical_loss": 3.6867919229665103, + "tokens_seen": 898832384 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003675025075225677, + "loss": 3.0228, + "theoretical_loss": 3.6867658353951986, + "tokens_seen": 898897920 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003674924774322969, + "loss": 2.8278, + "theoretical_loss": 3.686739750258292, + "tokens_seen": 898963456 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003674824473420261, + "loss": 2.9975, + "theoretical_loss": 3.686713667555387, + "tokens_seen": 899028992 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036747241725175526, + "loss": 2.8245, + "theoretical_loss": 3.686687587286078, + "tokens_seen": 899094528 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036746238716148444, + "loss": 2.8684, + "theoretical_loss": 3.686661509449961, + "tokens_seen": 899160064 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003674523570712137, + "loss": 2.8357, + "theoretical_loss": 3.686635434046631, + "tokens_seen": 899225600 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003674423269809428, + "loss": 3.0147, + "theoretical_loss": 3.6866093610756847, + "tokens_seen": 899291136 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036743229689067204, + "loss": 2.9731, + "theoretical_loss": 3.686583290536718, + "tokens_seen": 899356672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036742226680040117, + "loss": 3.1408, + "theoretical_loss": 3.6865572224293257, + "tokens_seen": 899422208 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1449594, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6759262084960938, + "objective/train/theoretical_loss": 3.686544189287344, + "objective/train/tokens_used": 919914976, + "theoretical_loss": 3.686544189287344, + "tokens_seen": 899454976 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003674122367101304, + "loss": 2.9678, + "theoretical_loss": 3.686531156753105, + "tokens_seen": 899487744 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003674022066198596, + "loss": 2.977, + "theoretical_loss": 3.6865050935076518, + "tokens_seen": 899553280 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036739217652958877, + "loss": 3.0031, + "theoretical_loss": 3.686479032692562, + "tokens_seen": 899618816 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036738214643931795, + "loss": 2.9433, + "theoretical_loss": 3.686452974307432, + "tokens_seen": 899684352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003673721163490472, + "loss": 3.02, + "theoretical_loss": 3.6864269183518585, + "tokens_seen": 899749888 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003673620862587763, + "loss": 2.9018, + "theoretical_loss": 3.686400864825438, + "tokens_seen": 899815424 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036735205616850555, + "loss": 2.9609, + "theoretical_loss": 3.6863748137277668, + "tokens_seen": 899880960 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036734202607823467, + "loss": 2.8166, + "theoretical_loss": 3.6863487650584426, + "tokens_seen": 899946496 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003673319959879639, + "loss": 2.8263, + "theoretical_loss": 3.686322718817061, + "tokens_seen": 900012032 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003673219658976931, + "loss": 3.0952, + "theoretical_loss": 3.6862966750032196, + "tokens_seen": 900077568 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036731193580742227, + "loss": 2.9484, + "theoretical_loss": 3.686270633616515, + "tokens_seen": 900143104 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036730190571715145, + "loss": 2.9581, + "theoretical_loss": 3.6862445946565456, + "tokens_seen": 900208640 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036729187562688063, + "loss": 3.0291, + "theoretical_loss": 3.6862185581229063, + "tokens_seen": 900274176 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003672818455366098, + "loss": 3.0407, + "theoretical_loss": 3.6861925240151967, + "tokens_seen": 900339712 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036727181544633905, + "loss": 2.8826, + "theoretical_loss": 3.6861664923330126, + "tokens_seen": 900405248 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003672617853560682, + "loss": 2.8718, + "theoretical_loss": 3.6861404630759527, + "tokens_seen": 900470784 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003672517552657974, + "loss": 2.8959, + "theoretical_loss": 3.686114436243614, + "tokens_seen": 900536320 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036724172517552654, + "loss": 2.9288, + "theoretical_loss": 3.686088411835594, + "tokens_seen": 900601856 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036723169508525577, + "loss": 2.8574, + "theoretical_loss": 3.686062389851491, + "tokens_seen": 900667392 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036722166499498495, + "loss": 3.0318, + "theoretical_loss": 3.686036370290902, + "tokens_seen": 900732928 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036721163490471414, + "loss": 2.956, + "theoretical_loss": 3.686010353153426, + "tokens_seen": 900798464 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003672016048144433, + "loss": 2.9667, + "theoretical_loss": 3.685984338438661, + "tokens_seen": 900864000 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036719157472417255, + "loss": 3.0791, + "theoretical_loss": 3.6859583261462046, + "tokens_seen": 900929536 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003671815446339017, + "loss": 3.0175, + "theoretical_loss": 3.685932316275655, + "tokens_seen": 900995072 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003671715145436309, + "loss": 2.9817, + "theoretical_loss": 3.6859063088266115, + "tokens_seen": 901060608 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1452400, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.054413080215454, + "objective/train/theoretical_loss": 3.6858933060100285, + "objective/train/tokens_used": 921553376, + "theoretical_loss": 3.6858933060100285, + "tokens_seen": 901093376 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036716148445336004, + "loss": 2.9177, + "theoretical_loss": 3.6858803037986716, + "tokens_seen": 901126144 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003671514543630893, + "loss": 2.8906, + "theoretical_loss": 3.685854301191434, + "tokens_seen": 901191680 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036714142427281846, + "loss": 2.9235, + "theoretical_loss": 3.6858283010044977, + "tokens_seen": 901257216 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036713139418254764, + "loss": 2.9882, + "theoretical_loss": 3.6858023032374616, + "tokens_seen": 901322752 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003671213640922768, + "loss": 3.0956, + "theoretical_loss": 3.685776307889924, + "tokens_seen": 901388288 + }, + { + "epoch": 2.07, + "learning_rate": 0.000367111334002006, + "loss": 2.8669, + "theoretical_loss": 3.6857503149614845, + "tokens_seen": 901453824 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036710130391173524, + "loss": 2.8969, + "theoretical_loss": 3.6857243244517415, + "tokens_seen": 901519360 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003670912738214644, + "loss": 2.9091, + "theoretical_loss": 3.6856983363602946, + "tokens_seen": 901584896 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003670812437311936, + "loss": 2.9848, + "theoretical_loss": 3.685672350686742, + "tokens_seen": 901650432 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003670712136409228, + "loss": 3.083, + "theoretical_loss": 3.6856463674306847, + "tokens_seen": 901715968 + }, + { + "epoch": 2.07, + "learning_rate": 0.000367061183550652, + "loss": 2.9544, + "theoretical_loss": 3.6856203865917205, + "tokens_seen": 901781504 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036705115346038114, + "loss": 2.8423, + "theoretical_loss": 3.6855944081694503, + "tokens_seen": 901847040 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003670411233701104, + "loss": 2.8814, + "theoretical_loss": 3.685568432163473, + "tokens_seen": 901912576 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003670310932798395, + "loss": 2.8641, + "theoretical_loss": 3.685542458573388, + "tokens_seen": 901978112 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036702106318956874, + "loss": 2.9645, + "theoretical_loss": 3.685516487398796, + "tokens_seen": 902043648 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003670110330992979, + "loss": 2.808, + "theoretical_loss": 3.6854905186392957, + "tokens_seen": 902109184 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003670010030090271, + "loss": 2.8911, + "theoretical_loss": 3.685464552294488, + "tokens_seen": 902174720 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003669909729187563, + "loss": 2.838, + "theoretical_loss": 3.685438588363973, + "tokens_seen": 902240256 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036698094282848546, + "loss": 3.0198, + "theoretical_loss": 3.685412626847351, + "tokens_seen": 902305792 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036697091273821464, + "loss": 2.8689, + "theoretical_loss": 3.685386667744221, + "tokens_seen": 902371328 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003669608826479439, + "loss": 2.9148, + "theoretical_loss": 3.685360711054185, + "tokens_seen": 902436864 + }, + { + "epoch": 2.07, + "learning_rate": 0.000366950852557673, + "loss": 2.8411, + "theoretical_loss": 3.685334756776842, + "tokens_seen": 902502400 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036694082246740224, + "loss": 2.8554, + "theoretical_loss": 3.685308804911794, + "tokens_seen": 902567936 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036693079237713137, + "loss": 3.0239, + "theoretical_loss": 3.6852828554586408, + "tokens_seen": 902633472 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003669207622868606, + "loss": 2.859, + "theoretical_loss": 3.6852569084169833, + "tokens_seen": 902699008 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1455166, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9749364852905273, + "objective/train/theoretical_loss": 3.685243935800341, + "objective/train/tokens_used": 923191776, + "theoretical_loss": 3.685243935800341, + "tokens_seen": 902731776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003669107321965898, + "loss": 2.8923, + "theoretical_loss": 3.6852309637864225, + "tokens_seen": 902764544 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036690070210631897, + "loss": 2.9357, + "theoretical_loss": 3.685205021566559, + "tokens_seen": 902830080 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036689067201604815, + "loss": 3.009, + "theoretical_loss": 3.685179081756994, + "tokens_seen": 902895616 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003668806419257774, + "loss": 2.9613, + "theoretical_loss": 3.6851531443573293, + "tokens_seen": 902961152 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003668706118355065, + "loss": 2.9142, + "theoretical_loss": 3.685127209367165, + "tokens_seen": 903026688 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036686058174523575, + "loss": 3.0671, + "theoretical_loss": 3.685101276786103, + "tokens_seen": 903092224 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036685055165496487, + "loss": 2.9588, + "theoretical_loss": 3.6850753466137443, + "tokens_seen": 903157760 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003668405215646941, + "loss": 2.8365, + "theoretical_loss": 3.685049418849691, + "tokens_seen": 903223296 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003668304914744233, + "loss": 2.9717, + "theoretical_loss": 3.685023493493545, + "tokens_seen": 903288832 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036682046138415247, + "loss": 2.9838, + "theoretical_loss": 3.6849975705449065, + "tokens_seen": 903354368 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036681043129388165, + "loss": 2.8679, + "theoretical_loss": 3.684971650003379, + "tokens_seen": 903419904 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036680040120361083, + "loss": 2.9051, + "theoretical_loss": 3.684945731868564, + "tokens_seen": 903485440 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036679037111334, + "loss": 2.975, + "theoretical_loss": 3.6849198161400625, + "tokens_seen": 903550976 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036678034102306925, + "loss": 2.9057, + "theoretical_loss": 3.6848939028174774, + "tokens_seen": 903616512 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003667703109327984, + "loss": 2.9337, + "theoretical_loss": 3.6848679919004104, + "tokens_seen": 903682048 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003667602808425276, + "loss": 3.0229, + "theoretical_loss": 3.6848420833884648, + "tokens_seen": 903747584 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036675025075225674, + "loss": 2.9181, + "theoretical_loss": 3.684816177281242, + "tokens_seen": 903813120 + }, + { + "epoch": 2.07, + "learning_rate": 0.000366740220661986, + "loss": 2.9844, + "theoretical_loss": 3.684790273578344, + "tokens_seen": 903878656 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036673019057171515, + "loss": 2.8657, + "theoretical_loss": 3.684764372279375, + "tokens_seen": 903944192 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036672016048144434, + "loss": 2.87, + "theoretical_loss": 3.684738473383936, + "tokens_seen": 904009728 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003667101303911735, + "loss": 2.9334, + "theoretical_loss": 3.6847125768916307, + "tokens_seen": 904075264 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036670010030090275, + "loss": 2.93, + "theoretical_loss": 3.684686682802062, + "tokens_seen": 904140800 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003666900702106319, + "loss": 2.9344, + "theoretical_loss": 3.6846607911148324, + "tokens_seen": 904206336 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003666800401203611, + "loss": 2.8665, + "theoretical_loss": 3.6846349018295443, + "tokens_seen": 904271872 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036667001003009024, + "loss": 2.9946, + "theoretical_loss": 3.684609014945803, + "tokens_seen": 904337408 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1457960, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8243372440338135, + "objective/train/theoretical_loss": 3.6845960724043874, + "objective/train/tokens_used": 924830176, + "theoretical_loss": 3.6845960724043874, + "tokens_seen": 904370176 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003666599799398195, + "loss": 2.9837, + "theoretical_loss": 3.684583130463209, + "tokens_seen": 904402944 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036664994984954866, + "loss": 3.0543, + "theoretical_loss": 3.6845572483813678, + "tokens_seen": 904468480 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036663991975927784, + "loss": 2.8552, + "theoretical_loss": 3.684531368699881, + "tokens_seen": 904534016 + }, + { + "epoch": 2.07, + "learning_rate": 0.000366629889669007, + "loss": 3.0884, + "theoretical_loss": 3.6845054914183537, + "tokens_seen": 904599552 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003666198595787362, + "loss": 3.0131, + "theoretical_loss": 3.6844796165363887, + "tokens_seen": 904665088 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003666098294884654, + "loss": 2.8026, + "theoretical_loss": 3.68445374405359, + "tokens_seen": 904730624 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003665997993981946, + "loss": 2.9564, + "theoretical_loss": 3.6844278739695606, + "tokens_seen": 904796160 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036658976930792374, + "loss": 2.8981, + "theoretical_loss": 3.6844020062839054, + "tokens_seen": 904861696 + }, + { + "epoch": 2.07, + "learning_rate": 0.000366579739217653, + "loss": 2.7729, + "theoretical_loss": 3.6843761409962275, + "tokens_seen": 904927232 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036656970912738216, + "loss": 2.9695, + "theoretical_loss": 3.6843502781061312, + "tokens_seen": 904992768 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036655967903711134, + "loss": 3.0072, + "theoretical_loss": 3.6843244176132215, + "tokens_seen": 905058304 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003665496489468405, + "loss": 2.8839, + "theoretical_loss": 3.684298559517102, + "tokens_seen": 905123840 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003665396188565697, + "loss": 2.843, + "theoretical_loss": 3.6842727038173764, + "tokens_seen": 905189376 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003665295887662989, + "loss": 2.9756, + "theoretical_loss": 3.68424685051365, + "tokens_seen": 905254912 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003665195586760281, + "loss": 3.0202, + "theoretical_loss": 3.684220999605527, + "tokens_seen": 905320448 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036650952858575725, + "loss": 2.8906, + "theoretical_loss": 3.684195151092612, + "tokens_seen": 905385984 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003664994984954865, + "loss": 2.9777, + "theoretical_loss": 3.6841693049745103, + "tokens_seen": 905451520 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003664894684052156, + "loss": 2.7498, + "theoretical_loss": 3.684143461250826, + "tokens_seen": 905517056 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036647943831494484, + "loss": 3.0735, + "theoretical_loss": 3.6841176199211643, + "tokens_seen": 905582592 + }, + { + "epoch": 2.07, + "learning_rate": 0.000366469408224674, + "loss": 2.908, + "theoretical_loss": 3.6840917809851303, + "tokens_seen": 905648128 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003664593781344032, + "loss": 2.7609, + "theoretical_loss": 3.6840659444423287, + "tokens_seen": 905713664 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003664493480441324, + "loss": 2.8717, + "theoretical_loss": 3.684040110292365, + "tokens_seen": 905779200 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036643931795386157, + "loss": 2.7702, + "theoretical_loss": 3.6840142785348444, + "tokens_seen": 905844736 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036642928786359075, + "loss": 2.8581, + "theoretical_loss": 3.6839884491693726, + "tokens_seen": 905910272 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036641925777332, + "loss": 3.1506, + "theoretical_loss": 3.683962622195555, + "tokens_seen": 905975808 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1459482, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9163169860839844, + "objective/train/theoretical_loss": 3.6839497096053924, + "objective/train/tokens_used": 926468576, + "theoretical_loss": 3.6839497096053924, + "tokens_seen": 906008576 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003664092276830491, + "loss": 2.8397, + "theoretical_loss": 3.683936797612996, + "tokens_seen": 906041344 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036639919759277835, + "loss": 2.9263, + "theoretical_loss": 3.683910975421303, + "tokens_seen": 906106880 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036638916750250753, + "loss": 3.0034, + "theoretical_loss": 3.6838851556200805, + "tokens_seen": 906172416 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003663791374122367, + "loss": 2.781, + "theoretical_loss": 3.683859338208935, + "tokens_seen": 906237952 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003663691073219659, + "loss": 2.9388, + "theoretical_loss": 3.683833523187472, + "tokens_seen": 906303488 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036635907723169507, + "loss": 2.8998, + "theoretical_loss": 3.683807710555298, + "tokens_seen": 906369024 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003663490471414243, + "loss": 2.8898, + "theoretical_loss": 3.683781900312019, + "tokens_seen": 906434560 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003663390170511535, + "loss": 2.8106, + "theoretical_loss": 3.683756092457241, + "tokens_seen": 906500096 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036632898696088267, + "loss": 2.9282, + "theoretical_loss": 3.6837302869905706, + "tokens_seen": 906565632 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036631895687061185, + "loss": 2.9207, + "theoretical_loss": 3.6837044839116135, + "tokens_seen": 906631168 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036630892678034103, + "loss": 2.8658, + "theoretical_loss": 3.683678683219977, + "tokens_seen": 906696704 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003662988966900702, + "loss": 2.8477, + "theoretical_loss": 3.6836528849152677, + "tokens_seen": 906762240 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036628886659979945, + "loss": 2.9472, + "theoretical_loss": 3.683627088997092, + "tokens_seen": 906827776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003662788365095286, + "loss": 2.9035, + "theoretical_loss": 3.6836012954650563, + "tokens_seen": 906893312 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003662688064192578, + "loss": 2.978, + "theoretical_loss": 3.683575504318768, + "tokens_seen": 906958848 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036625877632898694, + "loss": 2.8608, + "theoretical_loss": 3.6835497155578336, + "tokens_seen": 907024384 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003662487462387162, + "loss": 2.8776, + "theoretical_loss": 3.6835239291818613, + "tokens_seen": 907089920 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036623871614844535, + "loss": 2.7582, + "theoretical_loss": 3.6834981451904563, + "tokens_seen": 907155456 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036622868605817454, + "loss": 2.9364, + "theoretical_loss": 3.6834723635832276, + "tokens_seen": 907220992 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003662186559679037, + "loss": 2.9634, + "theoretical_loss": 3.6834465843597823, + "tokens_seen": 907286528 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036620862587763295, + "loss": 2.9901, + "theoretical_loss": 3.683420807519727, + "tokens_seen": 907352064 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003661985957873621, + "loss": 2.7731, + "theoretical_loss": 3.68339503306267, + "tokens_seen": 907417600 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003661885656970913, + "loss": 2.8628, + "theoretical_loss": 3.683369260988218, + "tokens_seen": 907483136 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036617853560682044, + "loss": 2.9374, + "theoretical_loss": 3.6833434912959797, + "tokens_seen": 907548672 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003661685055165497, + "loss": 2.9292, + "theoretical_loss": 3.683317723985562, + "tokens_seen": 907614208 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1461872, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.132350206375122, + "objective/train/theoretical_loss": 3.6833048412234137, + "objective/train/tokens_used": 928106976, + "theoretical_loss": 3.6833048412234137, + "tokens_seen": 907646976 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036615847542627886, + "loss": 3.0494, + "theoretical_loss": 3.683291959056574, + "tokens_seen": 907679744 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036614844533600804, + "loss": 2.9493, + "theoretical_loss": 3.6832661965086224, + "tokens_seen": 907745280 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003661384152457372, + "loss": 3.0026, + "theoretical_loss": 3.683240436341316, + "tokens_seen": 907810816 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003661283851554664, + "loss": 2.9413, + "theoretical_loss": 3.683214678554263, + "tokens_seen": 907876352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003661183550651956, + "loss": 2.7505, + "theoretical_loss": 3.683188923147071, + "tokens_seen": 907941888 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003661083249749248, + "loss": 2.8155, + "theoretical_loss": 3.6831631701193492, + "tokens_seen": 908007424 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036609829488465394, + "loss": 2.9091, + "theoretical_loss": 3.683137419470706, + "tokens_seen": 908072960 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003660882647943832, + "loss": 2.9811, + "theoretical_loss": 3.6831116712007494, + "tokens_seen": 908138496 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036607823470411236, + "loss": 2.8664, + "theoretical_loss": 3.683085925309088, + "tokens_seen": 908204032 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036606820461384154, + "loss": 2.8192, + "theoretical_loss": 3.683060181795331, + "tokens_seen": 908269568 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003660581745235707, + "loss": 2.8768, + "theoretical_loss": 3.683034440659087, + "tokens_seen": 908335104 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003660481444332999, + "loss": 2.8723, + "theoretical_loss": 3.6830087018999653, + "tokens_seen": 908400640 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003660381143430291, + "loss": 2.8825, + "theoretical_loss": 3.6829829655175743, + "tokens_seen": 908466176 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003660280842527583, + "loss": 2.9872, + "theoretical_loss": 3.6829572315115238, + "tokens_seen": 908531712 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036601805416248745, + "loss": 3.0428, + "theoretical_loss": 3.6829314998814215, + "tokens_seen": 908597248 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003660080240722167, + "loss": 2.822, + "theoretical_loss": 3.6829057706268786, + "tokens_seen": 908662784 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003659979939819458, + "loss": 2.8965, + "theoretical_loss": 3.6828800437475033, + "tokens_seen": 908728320 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036598796389167504, + "loss": 2.9974, + "theoretical_loss": 3.6828543192429057, + "tokens_seen": 908793856 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003659779338014042, + "loss": 2.8456, + "theoretical_loss": 3.6828285971126946, + "tokens_seen": 908859392 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003659679037111334, + "loss": 2.9067, + "theoretical_loss": 3.6828028773564805, + "tokens_seen": 908924928 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003659578736208626, + "loss": 2.9304, + "theoretical_loss": 3.682777159973872, + "tokens_seen": 908990464 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036594784353059177, + "loss": 2.7673, + "theoretical_loss": 3.6827514449644805, + "tokens_seen": 909056000 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036593781344032095, + "loss": 2.8935, + "theoretical_loss": 3.682725732327915, + "tokens_seen": 909121536 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003659277833500502, + "loss": 2.8319, + "theoretical_loss": 3.682700022063785, + "tokens_seen": 909187072 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003659177532597793, + "loss": 3.0231, + "theoretical_loss": 3.6826743141717015, + "tokens_seen": 909252608 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1464834, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.070591449737549, + "objective/train/theoretical_loss": 3.682661461115056, + "objective/train/tokens_used": 929745376, + "theoretical_loss": 3.682661461115056, + "tokens_seen": 909285376 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036590772316950855, + "loss": 3.0275, + "theoretical_loss": 3.6826486086512746, + "tokens_seen": 909318144 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036589769307923773, + "loss": 2.8345, + "theoretical_loss": 3.6826229055021145, + "tokens_seen": 909383680 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003658876629889669, + "loss": 2.7689, + "theoretical_loss": 3.682597204723832, + "tokens_seen": 909449216 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003658776328986961, + "loss": 2.8942, + "theoretical_loss": 3.6825715063160365, + "tokens_seen": 909514752 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036586760280842527, + "loss": 2.86, + "theoretical_loss": 3.6825458102783397, + "tokens_seen": 909580288 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036585757271815445, + "loss": 2.8697, + "theoretical_loss": 3.682520116610351, + "tokens_seen": 909645824 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003658475426278837, + "loss": 2.8862, + "theoretical_loss": 3.6824944253116825, + "tokens_seen": 909711360 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003658375125376128, + "loss": 2.9812, + "theoretical_loss": 3.6824687363819444, + "tokens_seen": 909776896 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036582748244734205, + "loss": 2.9608, + "theoretical_loss": 3.6824430498207477, + "tokens_seen": 909842432 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003658174523570712, + "loss": 2.7428, + "theoretical_loss": 3.6824173656277033, + "tokens_seen": 909907968 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003658074222668004, + "loss": 2.9572, + "theoretical_loss": 3.6823916838024227, + "tokens_seen": 909973504 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003657973921765296, + "loss": 3.0045, + "theoretical_loss": 3.6823660043445168, + "tokens_seen": 910039040 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003657873620862588, + "loss": 3.0013, + "theoretical_loss": 3.6823403272535975, + "tokens_seen": 910104576 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036577733199598796, + "loss": 2.8301, + "theoretical_loss": 3.682314652529275, + "tokens_seen": 910170112 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036576730190571714, + "loss": 2.9384, + "theoretical_loss": 3.682288980171162, + "tokens_seen": 910235648 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003657572718154463, + "loss": 2.8955, + "theoretical_loss": 3.6822633101788695, + "tokens_seen": 910301184 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036574724172517555, + "loss": 2.9398, + "theoretical_loss": 3.6822376425520096, + "tokens_seen": 910366720 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003657372116349047, + "loss": 3.057, + "theoretical_loss": 3.6822119772901933, + "tokens_seen": 910432256 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003657271815446339, + "loss": 2.9088, + "theoretical_loss": 3.682186314393033, + "tokens_seen": 910497792 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003657171514543631, + "loss": 2.7174, + "theoretical_loss": 3.682160653860141, + "tokens_seen": 910563328 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003657071213640923, + "loss": 2.7959, + "theoretical_loss": 3.6821349956911282, + "tokens_seen": 910628864 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036569709127382146, + "loss": 3.0003, + "theoretical_loss": 3.6821093398856077, + "tokens_seen": 910694400 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036568706118355064, + "loss": 2.9999, + "theoretical_loss": 3.6820836864431916, + "tokens_seen": 910759936 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003656770310932798, + "loss": 2.9676, + "theoretical_loss": 3.682058035363492, + "tokens_seen": 910825472 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036566700100300906, + "loss": 2.7685, + "theoretical_loss": 3.6820323866461218, + "tokens_seen": 910891008 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1467634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.943638324737549, + "objective/train/theoretical_loss": 3.6820195631731885, + "objective/train/tokens_used": 931383776, + "theoretical_loss": 3.6820195631731885, + "tokens_seen": 910923776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003656569709127382, + "loss": 2.8683, + "theoretical_loss": 3.6820067402906926, + "tokens_seen": 910956544 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003656469408224674, + "loss": 2.9663, + "theoretical_loss": 3.681981096296818, + "tokens_seen": 911022080 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036563691073219655, + "loss": 3.0822, + "theoretical_loss": 3.68195545466411, + "tokens_seen": 911087616 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003656268806419258, + "loss": 2.8379, + "theoretical_loss": 3.6819298153921816, + "tokens_seen": 911153152 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036561685055165496, + "loss": 2.8384, + "theoretical_loss": 3.6819041784806457, + "tokens_seen": 911218688 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036560682046138414, + "loss": 2.8514, + "theoretical_loss": 3.681878543929115, + "tokens_seen": 911284224 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003655967903711134, + "loss": 2.9418, + "theoretical_loss": 3.6818529117372023, + "tokens_seen": 911349760 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036558676028084256, + "loss": 2.6995, + "theoretical_loss": 3.681827281904522, + "tokens_seen": 911415296 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036557673019057174, + "loss": 2.8794, + "theoretical_loss": 3.6818016544306866, + "tokens_seen": 911480832 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003655667001003009, + "loss": 3.0418, + "theoretical_loss": 3.6817760293153086, + "tokens_seen": 911546368 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003655566700100301, + "loss": 2.949, + "theoretical_loss": 3.6817504065580025, + "tokens_seen": 911611904 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003655466399197593, + "loss": 3.1791, + "theoretical_loss": 3.681724786158382, + "tokens_seen": 911677440 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003655366098294885, + "loss": 2.8874, + "theoretical_loss": 3.68169916811606, + "tokens_seen": 911742976 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036552657973921765, + "loss": 2.8066, + "theoretical_loss": 3.6816735524306496, + "tokens_seen": 911808512 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003655165496489469, + "loss": 2.9834, + "theoretical_loss": 3.681647939101766, + "tokens_seen": 911874048 + }, + { + "epoch": 2.07, + "learning_rate": 0.000365506519558676, + "loss": 2.7719, + "theoretical_loss": 3.681622328129022, + "tokens_seen": 911939584 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036549648946840525, + "loss": 2.9709, + "theoretical_loss": 3.6815967195120325, + "tokens_seen": 912005120 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003654864593781344, + "loss": 2.977, + "theoretical_loss": 3.6815711132504108, + "tokens_seen": 912070656 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003654764292878636, + "loss": 3.0456, + "theoretical_loss": 3.681545509343771, + "tokens_seen": 912136192 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003654663991975928, + "loss": 2.967, + "theoretical_loss": 3.6815199077917278, + "tokens_seen": 912201728 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036545636910732197, + "loss": 2.9285, + "theoretical_loss": 3.6814943085938956, + "tokens_seen": 912267264 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036544633901705115, + "loss": 2.9343, + "theoretical_loss": 3.6814687117498885, + "tokens_seen": 912332800 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003654363089267804, + "loss": 2.8961, + "theoretical_loss": 3.6814431172593207, + "tokens_seen": 912398336 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003654262788365095, + "loss": 2.8263, + "theoretical_loss": 3.6814175251218075, + "tokens_seen": 912463872 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036541624874623875, + "loss": 2.8482, + "theoretical_loss": 3.681391935336963, + "tokens_seen": 912529408 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1470449, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5865087509155273, + "objective/train/theoretical_loss": 3.6813791413266714, + "objective/train/tokens_used": 933022176, + "theoretical_loss": 3.6813791413266714, + "tokens_seen": 912562176 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036540621865596793, + "loss": 2.7644, + "theoretical_loss": 3.6813663479044028, + "tokens_seen": 912594944 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003653961885656971, + "loss": 2.8237, + "theoretical_loss": 3.6813407628237407, + "tokens_seen": 912660480 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003653861584754263, + "loss": 2.8831, + "theoretical_loss": 3.6813151800945922, + "tokens_seen": 912726016 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036537612838515547, + "loss": 2.8442, + "theoretical_loss": 3.681289599716572, + "tokens_seen": 912791552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036536609829488465, + "loss": 2.9226, + "theoretical_loss": 3.6812640216892962, + "tokens_seen": 912857088 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003653560682046139, + "loss": 2.9863, + "theoretical_loss": 3.6812384460123786, + "tokens_seen": 912922624 + }, + { + "epoch": 2.07, + "learning_rate": 0.000365346038114343, + "loss": 2.6628, + "theoretical_loss": 3.6812128726854363, + "tokens_seen": 912988160 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036533600802407225, + "loss": 2.911, + "theoretical_loss": 3.681187301708083, + "tokens_seen": 913053696 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003653259779338014, + "loss": 2.9093, + "theoretical_loss": 3.681161733079935, + "tokens_seen": 913119232 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003653159478435306, + "loss": 2.896, + "theoretical_loss": 3.681136166800608, + "tokens_seen": 913184768 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003653059177532598, + "loss": 2.9732, + "theoretical_loss": 3.681110602869717, + "tokens_seen": 913250304 + }, + { + "epoch": 2.07, + "learning_rate": 0.000365295887662989, + "loss": 2.8684, + "theoretical_loss": 3.681085041286879, + "tokens_seen": 913315840 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036528585757271816, + "loss": 2.8674, + "theoretical_loss": 3.681059482051709, + "tokens_seen": 913381376 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036527582748244734, + "loss": 2.9698, + "theoretical_loss": 3.681033925163823, + "tokens_seen": 913446912 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003652657973921765, + "loss": 2.7464, + "theoretical_loss": 3.6810083706228376, + "tokens_seen": 913512448 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036525576730190575, + "loss": 2.8468, + "theoretical_loss": 3.680982818428368, + "tokens_seen": 913577984 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003652457372116349, + "loss": 2.9005, + "theoretical_loss": 3.680957268580031, + "tokens_seen": 913643520 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003652357071213641, + "loss": 2.9321, + "theoretical_loss": 3.680931721077443, + "tokens_seen": 913709056 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003652256770310933, + "loss": 2.9803, + "theoretical_loss": 3.68090617592022, + "tokens_seen": 913774592 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003652156469408225, + "loss": 2.8939, + "theoretical_loss": 3.6808806331079786, + "tokens_seen": 913840128 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036520561685055166, + "loss": 2.9648, + "theoretical_loss": 3.6808550926403365, + "tokens_seen": 913905664 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036519558676028084, + "loss": 2.9464, + "theoretical_loss": 3.6808295545169085, + "tokens_seen": 913971200 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036518555667001, + "loss": 2.9882, + "theoretical_loss": 3.6808040187373123, + "tokens_seen": 914036736 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036517552657973926, + "loss": 3.0648, + "theoretical_loss": 3.6807784853011656, + "tokens_seen": 914102272 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003651654964894684, + "loss": 2.943, + "theoretical_loss": 3.6807529542080837, + "tokens_seen": 914167808 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1473205, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.845226287841797, + "objective/train/theoretical_loss": 3.6807401895400726, + "objective/train/tokens_used": 934660576, + "theoretical_loss": 3.6807401895400726, + "tokens_seen": 914200576 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003651554663991976, + "loss": 2.8905, + "theoretical_loss": 3.6807274254576843, + "tokens_seen": 914233344 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036514543630892675, + "loss": 3.1939, + "theoretical_loss": 3.680701899049585, + "tokens_seen": 914298880 + }, + { + "epoch": 2.07, + "learning_rate": 0.000365135406218656, + "loss": 2.902, + "theoretical_loss": 3.680676374983403, + "tokens_seen": 914364416 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036512537612838516, + "loss": 3.0584, + "theoretical_loss": 3.680650853258755, + "tokens_seen": 914429952 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036511534603811434, + "loss": 2.7957, + "theoretical_loss": 3.680625333875259, + "tokens_seen": 914495488 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003651053159478435, + "loss": 2.9515, + "theoretical_loss": 3.680599816832532, + "tokens_seen": 914561024 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036509528585757276, + "loss": 2.9673, + "theoretical_loss": 3.680574302130192, + "tokens_seen": 914626560 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003650852557673019, + "loss": 3.0117, + "theoretical_loss": 3.680548789767856, + "tokens_seen": 914692096 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003650752256770311, + "loss": 2.9848, + "theoretical_loss": 3.680523279745142, + "tokens_seen": 914757632 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036506519558676025, + "loss": 2.9345, + "theoretical_loss": 3.6804977720616687, + "tokens_seen": 914823168 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003650551654964895, + "loss": 2.9058, + "theoretical_loss": 3.680472266717053, + "tokens_seen": 914888704 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036504513540621867, + "loss": 3.0667, + "theoretical_loss": 3.6804467637109135, + "tokens_seen": 914954240 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036503510531594785, + "loss": 2.9254, + "theoretical_loss": 3.6804212630428683, + "tokens_seen": 915019776 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036502507522567703, + "loss": 2.9236, + "theoretical_loss": 3.680395764712535, + "tokens_seen": 915085312 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003650150451354062, + "loss": 3.102, + "theoretical_loss": 3.6803702687195328, + "tokens_seen": 915150848 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003650050150451354, + "loss": 2.9154, + "theoretical_loss": 3.6803447750634795, + "tokens_seen": 915216384 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003649949849548646, + "loss": 2.6507, + "theoretical_loss": 3.6803192837439935, + "tokens_seen": 915281920 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036498495486459375, + "loss": 2.975, + "theoretical_loss": 3.6802937947606935, + "tokens_seen": 915347456 + }, + { + "epoch": 2.07, + "learning_rate": 0.000364974924774323, + "loss": 2.9202, + "theoretical_loss": 3.680268308113199, + "tokens_seen": 915412992 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003649648946840521, + "loss": 2.8818, + "theoretical_loss": 3.680242823801127, + "tokens_seen": 915478528 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036495486459378135, + "loss": 2.8981, + "theoretical_loss": 3.6802173418240978, + "tokens_seen": 915544064 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036494483450351053, + "loss": 3.03, + "theoretical_loss": 3.68019186218173, + "tokens_seen": 915609600 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003649348044132397, + "loss": 3.0329, + "theoretical_loss": 3.680166384873642, + "tokens_seen": 915675136 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003649247743229689, + "loss": 3.0494, + "theoretical_loss": 3.6801409098994537, + "tokens_seen": 915740672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036491474423269813, + "loss": 2.9482, + "theoretical_loss": 3.6801154372587837, + "tokens_seen": 915806208 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1475506, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7651333808898926, + "objective/train/theoretical_loss": 3.680102701813399, + "objective/train/tokens_used": 936298976, + "theoretical_loss": 3.680102701813399, + "tokens_seen": 915838976 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036490471414242726, + "loss": 2.863, + "theoretical_loss": 3.680089966951252, + "tokens_seen": 915871744 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003648946840521565, + "loss": 2.9001, + "theoretical_loss": 3.680064498976477, + "tokens_seen": 915937280 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003648846539618856, + "loss": 2.8836, + "theoretical_loss": 3.6800390333340793, + "tokens_seen": 916002816 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036487462387161485, + "loss": 2.8449, + "theoretical_loss": 3.680013570023677, + "tokens_seen": 916068352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003648645937813441, + "loss": 2.9057, + "theoretical_loss": 3.679988109044891, + "tokens_seen": 916133888 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003648545636910732, + "loss": 3.0943, + "theoretical_loss": 3.6799626503973406, + "tokens_seen": 916199424 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036484453360080245, + "loss": 2.9128, + "theoretical_loss": 3.6799371940806456, + "tokens_seen": 916264960 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003648345035105316, + "loss": 3.01, + "theoretical_loss": 3.679911740094426, + "tokens_seen": 916330496 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003648244734202608, + "loss": 2.9061, + "theoretical_loss": 3.6798862884383015, + "tokens_seen": 916396032 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036481444332999, + "loss": 2.9631, + "theoretical_loss": 3.6798608391118925, + "tokens_seen": 916461568 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003648044132397192, + "loss": 2.6949, + "theoretical_loss": 3.6798353921148195, + "tokens_seen": 916527104 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036479438314944836, + "loss": 3.0906, + "theoretical_loss": 3.6798099474467016, + "tokens_seen": 916592640 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036478435305917754, + "loss": 2.9806, + "theoretical_loss": 3.6797845051071603, + "tokens_seen": 916658176 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003647743229689067, + "loss": 3.0624, + "theoretical_loss": 3.6797590650958156, + "tokens_seen": 916723712 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036476429287863595, + "loss": 2.9112, + "theoretical_loss": 3.679733627412288, + "tokens_seen": 916789248 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003647542627883651, + "loss": 3.0284, + "theoretical_loss": 3.6797081920561983, + "tokens_seen": 916854784 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003647442326980943, + "loss": 2.8444, + "theoretical_loss": 3.679682759027167, + "tokens_seen": 916920320 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003647342026078235, + "loss": 2.8797, + "theoretical_loss": 3.679657328324815, + "tokens_seen": 916985856 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003647241725175527, + "loss": 2.928, + "theoretical_loss": 3.679631899948763, + "tokens_seen": 917051392 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036471414242728186, + "loss": 2.8992, + "theoretical_loss": 3.679606473898632, + "tokens_seen": 917116928 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036470411233701104, + "loss": 3.0108, + "theoretical_loss": 3.679581050174044, + "tokens_seen": 917182464 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003646940822467402, + "loss": 2.861, + "theoretical_loss": 3.6795556287746187, + "tokens_seen": 917248000 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036468405215646946, + "loss": 2.8932, + "theoretical_loss": 3.679530209699978, + "tokens_seen": 917313536 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003646740220661986, + "loss": 2.8733, + "theoretical_loss": 3.6795047929497438, + "tokens_seen": 917379072 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003646639919759278, + "loss": 2.9893, + "theoretical_loss": 3.6794793785235367, + "tokens_seen": 917444608 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1478457, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9837417602539062, + "objective/train/theoretical_loss": 3.6794666721818245, + "objective/train/tokens_used": 937937376, + "theoretical_loss": 3.6794666721818245, + "tokens_seen": 917477376 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036465396188565695, + "loss": 2.8503, + "theoretical_loss": 3.6794539664209784, + "tokens_seen": 917510144 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003646439317953862, + "loss": 2.9427, + "theoretical_loss": 3.6794285566416907, + "tokens_seen": 917575680 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036463390170511536, + "loss": 2.981, + "theoretical_loss": 3.679403149185295, + "tokens_seen": 917641216 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036462387161484454, + "loss": 2.9825, + "theoretical_loss": 3.679377744051413, + "tokens_seen": 917706752 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003646138415245737, + "loss": 2.934, + "theoretical_loss": 3.6793523412396674, + "tokens_seen": 917772288 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036460381143430296, + "loss": 2.8182, + "theoretical_loss": 3.6793269407496787, + "tokens_seen": 917837824 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003645937813440321, + "loss": 2.8748, + "theoretical_loss": 3.6793015425810704, + "tokens_seen": 917903360 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003645837512537613, + "loss": 2.579, + "theoretical_loss": 3.6792761467334643, + "tokens_seen": 917968896 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036457372116349045, + "loss": 2.8218, + "theoretical_loss": 3.6792507532064818, + "tokens_seen": 918034432 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003645636910732197, + "loss": 2.8441, + "theoretical_loss": 3.679225361999746, + "tokens_seen": 918099968 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036455366098294887, + "loss": 3.0289, + "theoretical_loss": 3.679199973112879, + "tokens_seen": 918165504 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036454363089267805, + "loss": 2.8112, + "theoretical_loss": 3.6791745865455026, + "tokens_seen": 918231040 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036453360080240723, + "loss": 2.9692, + "theoretical_loss": 3.679149202297241, + "tokens_seen": 918296576 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003645235707121364, + "loss": 2.9543, + "theoretical_loss": 3.6791238203677157, + "tokens_seen": 918362112 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003645135406218656, + "loss": 2.8888, + "theoretical_loss": 3.6790984407565497, + "tokens_seen": 918427648 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003645035105315948, + "loss": 2.9583, + "theoretical_loss": 3.679073063463366, + "tokens_seen": 918493184 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036449348044132395, + "loss": 2.9203, + "theoretical_loss": 3.6790476884877865, + "tokens_seen": 918558720 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003644834503510532, + "loss": 2.9412, + "theoretical_loss": 3.679022315829436, + "tokens_seen": 918624256 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003644734202607823, + "loss": 2.7926, + "theoretical_loss": 3.6789969454879365, + "tokens_seen": 918689792 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036446339017051155, + "loss": 2.7415, + "theoretical_loss": 3.678971577462911, + "tokens_seen": 918755328 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036445336008024073, + "loss": 2.9253, + "theoretical_loss": 3.678946211753983, + "tokens_seen": 918820864 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003644433299899699, + "loss": 2.8123, + "theoretical_loss": 3.6789208483607765, + "tokens_seen": 918886400 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003644332998996991, + "loss": 2.9062, + "theoretical_loss": 3.6788954872829143, + "tokens_seen": 918951936 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036442326980942833, + "loss": 2.9722, + "theoretical_loss": 3.6788701285200203, + "tokens_seen": 919017472 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036441323971915746, + "loss": 2.9568, + "theoretical_loss": 3.6788447720717175, + "tokens_seen": 919083008 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1479799, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.944450855255127, + "objective/train/theoretical_loss": 3.6788320947154203, + "objective/train/tokens_used": 939575776, + "theoretical_loss": 3.6788320947154203, + "tokens_seen": 919115776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003644032096288867, + "loss": 2.8947, + "theoretical_loss": 3.67881941793763, + "tokens_seen": 919148544 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003643931795386158, + "loss": 2.8012, + "theoretical_loss": 3.6787940661173817, + "tokens_seen": 919214080 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036438314944834505, + "loss": 2.881, + "theoretical_loss": 3.6787687166105965, + "tokens_seen": 919279616 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036437311935807424, + "loss": 2.9148, + "theoretical_loss": 3.6787433694168987, + "tokens_seen": 919345152 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003643630892678034, + "loss": 2.9779, + "theoretical_loss": 3.6787180245359115, + "tokens_seen": 919410688 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003643530591775326, + "loss": 2.7925, + "theoretical_loss": 3.6786926819672594, + "tokens_seen": 919476224 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003643430290872618, + "loss": 2.8835, + "theoretical_loss": 3.6786673417105673, + "tokens_seen": 919541760 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036433299899699096, + "loss": 3.0246, + "theoretical_loss": 3.678642003765459, + "tokens_seen": 919607296 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003643229689067202, + "loss": 3.0127, + "theoretical_loss": 3.6786166681315584, + "tokens_seen": 919672832 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003643129388164493, + "loss": 2.9011, + "theoretical_loss": 3.6785913348084907, + "tokens_seen": 919738368 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036430290872617856, + "loss": 2.8633, + "theoretical_loss": 3.6785660037958805, + "tokens_seen": 919803904 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003642928786359077, + "loss": 3.0197, + "theoretical_loss": 3.6785406750933527, + "tokens_seen": 919869440 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003642828485456369, + "loss": 3.0166, + "theoretical_loss": 3.6785153487005307, + "tokens_seen": 919934976 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003642728184553661, + "loss": 2.982, + "theoretical_loss": 3.678490024617041, + "tokens_seen": 920000512 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003642627883650953, + "loss": 2.8305, + "theoretical_loss": 3.6784647028425077, + "tokens_seen": 920066048 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036425275827482446, + "loss": 2.9483, + "theoretical_loss": 3.6784393833765563, + "tokens_seen": 920131584 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003642427281845537, + "loss": 3.0808, + "theoretical_loss": 3.678414066218812, + "tokens_seen": 920197120 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003642326980942828, + "loss": 2.9717, + "theoretical_loss": 3.6783887513688986, + "tokens_seen": 920262656 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036422266800401206, + "loss": 3.1061, + "theoretical_loss": 3.6783634388264432, + "tokens_seen": 920328192 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003642126379137412, + "loss": 2.9115, + "theoretical_loss": 3.6783381285910703, + "tokens_seen": 920393728 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003642026078234704, + "loss": 2.88, + "theoretical_loss": 3.6783128206624056, + "tokens_seen": 920459264 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003641925777331996, + "loss": 2.7831, + "theoretical_loss": 3.6782875150400742, + "tokens_seen": 920524800 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003641825476429288, + "loss": 3.0219, + "theoretical_loss": 3.6782622117237027, + "tokens_seen": 920590336 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036417251755265797, + "loss": 2.9251, + "theoretical_loss": 3.678236910712916, + "tokens_seen": 920655872 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036416248746238715, + "loss": 2.8486, + "theoretical_loss": 3.6782116120073396, + "tokens_seen": 920721408 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1482422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8843259811401367, + "objective/train/theoretical_loss": 3.6781989635188888, + "objective/train/tokens_used": 941214176, + "theoretical_loss": 3.6781989635188888, + "tokens_seen": 920754176 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036415245737211633, + "loss": 2.8763, + "theoretical_loss": 3.6781863156066006, + "tokens_seen": 920786944 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036414242728184556, + "loss": 2.863, + "theoretical_loss": 3.6781610215103244, + "tokens_seen": 920852480 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003641323971915747, + "loss": 2.9367, + "theoretical_loss": 3.6781357297181367, + "tokens_seen": 920918016 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003641223671013039, + "loss": 2.9602, + "theoretical_loss": 3.6781104402296645, + "tokens_seen": 920983552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036411233701103316, + "loss": 2.9044, + "theoretical_loss": 3.6780851530445333, + "tokens_seen": 921049088 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003641023069207623, + "loss": 2.9418, + "theoretical_loss": 3.6780598681623697, + "tokens_seen": 921114624 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003640922768304915, + "loss": 2.8521, + "theoretical_loss": 3.6780345855828003, + "tokens_seen": 921180160 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036408224674022065, + "loss": 3.0157, + "theoretical_loss": 3.6780093053054514, + "tokens_seen": 921245696 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003640722166499499, + "loss": 2.9144, + "theoretical_loss": 3.67798402732995, + "tokens_seen": 921311232 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036406218655967907, + "loss": 2.922, + "theoretical_loss": 3.677958751655922, + "tokens_seen": 921376768 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036405215646940825, + "loss": 2.7883, + "theoretical_loss": 3.677933478282995, + "tokens_seen": 921442304 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036404212637913743, + "loss": 2.9138, + "theoretical_loss": 3.6779082072107956, + "tokens_seen": 921507840 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003640320962888666, + "loss": 2.8221, + "theoretical_loss": 3.67788293843895, + "tokens_seen": 921573376 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003640220661985958, + "loss": 2.9524, + "theoretical_loss": 3.677857671967087, + "tokens_seen": 921638912 + }, + { + "epoch": 2.07, + "learning_rate": 0.000364012036108325, + "loss": 2.8837, + "theoretical_loss": 3.677832407794832, + "tokens_seen": 921704448 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036400200601805415, + "loss": 3.0298, + "theoretical_loss": 3.677807145921813, + "tokens_seen": 921769984 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003639919759277834, + "loss": 2.9094, + "theoretical_loss": 3.677781886347657, + "tokens_seen": 921835520 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003639819458375125, + "loss": 2.9194, + "theoretical_loss": 3.677756629071992, + "tokens_seen": 921901056 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036397191574724175, + "loss": 2.9384, + "theoretical_loss": 3.677731374094445, + "tokens_seen": 921966592 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036396188565697093, + "loss": 2.9087, + "theoretical_loss": 3.6777061214146434, + "tokens_seen": 922032128 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003639518555667001, + "loss": 2.8852, + "theoretical_loss": 3.677680871032215, + "tokens_seen": 922097664 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003639418254764293, + "loss": 2.8809, + "theoretical_loss": 3.677655622946787, + "tokens_seen": 922163200 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036393179538615853, + "loss": 2.9193, + "theoretical_loss": 3.677630377157989, + "tokens_seen": 922228736 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036392176529588766, + "loss": 2.918, + "theoretical_loss": 3.6776051336654474, + "tokens_seen": 922294272 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003639117352056169, + "loss": 2.8992, + "theoretical_loss": 3.67757989246879, + "tokens_seen": 922359808 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1485252, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8207075595855713, + "objective/train/theoretical_loss": 3.6775672727313022, + "objective/train/tokens_used": 942852576, + "theoretical_loss": 3.6775672727313022, + "tokens_seen": 922392576 + }, + { + "epoch": 2.07, + "learning_rate": 0.000363901705115346, + "loss": 3.0145, + "theoretical_loss": 3.677554653567646, + "tokens_seen": 922425344 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036389167502507525, + "loss": 2.8877, + "theoretical_loss": 3.677529416961643, + "tokens_seen": 922490880 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036388164493480444, + "loss": 2.9163, + "theoretical_loss": 3.6775041826504093, + "tokens_seen": 922556416 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003638716148445336, + "loss": 2.7915, + "theoretical_loss": 3.6774789506335726, + "tokens_seen": 922621952 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003638615847542628, + "loss": 2.8027, + "theoretical_loss": 3.6774537209107625, + "tokens_seen": 922687488 + }, + { + "epoch": 2.07, + "learning_rate": 0.000363851554663992, + "loss": 3.0021, + "theoretical_loss": 3.6774284934816066, + "tokens_seen": 922753024 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036384152457372116, + "loss": 3.025, + "theoretical_loss": 3.677403268345734, + "tokens_seen": 922818560 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003638314944834504, + "loss": 2.8379, + "theoretical_loss": 3.677378045502773, + "tokens_seen": 922884096 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003638214643931795, + "loss": 3.0154, + "theoretical_loss": 3.677352824952353, + "tokens_seen": 922949632 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036381143430290876, + "loss": 2.8672, + "theoretical_loss": 3.677327606694102, + "tokens_seen": 923015168 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003638014042126379, + "loss": 2.9863, + "theoretical_loss": 3.6773023907276503, + "tokens_seen": 923080704 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003637913741223671, + "loss": 2.8994, + "theoretical_loss": 3.677277177052625, + "tokens_seen": 923146240 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003637813440320963, + "loss": 2.8247, + "theoretical_loss": 3.677251965668657, + "tokens_seen": 923211776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003637713139418255, + "loss": 2.8621, + "theoretical_loss": 3.6772267565753745, + "tokens_seen": 923277312 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036376128385155466, + "loss": 2.9341, + "theoretical_loss": 3.6772015497724073, + "tokens_seen": 923342848 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003637512537612839, + "loss": 2.9049, + "theoretical_loss": 3.677176345259384, + "tokens_seen": 923408384 + }, + { + "epoch": 2.07, + "learning_rate": 0.000363741223671013, + "loss": 2.9914, + "theoretical_loss": 3.6771511430359354, + "tokens_seen": 923473920 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036373119358074226, + "loss": 2.9869, + "theoretical_loss": 3.6771259431016894, + "tokens_seen": 923539456 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003637211634904714, + "loss": 3.0388, + "theoretical_loss": 3.6771007454562774, + "tokens_seen": 923604992 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003637111334002006, + "loss": 2.9546, + "theoretical_loss": 3.677075550099328, + "tokens_seen": 923670528 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003637011033099298, + "loss": 3.0134, + "theoretical_loss": 3.677050357030471, + "tokens_seen": 923736064 + }, + { + "epoch": 2.07, + "learning_rate": 0.000363691073219659, + "loss": 3.0341, + "theoretical_loss": 3.6770251662493365, + "tokens_seen": 923801600 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036368104312938817, + "loss": 2.9303, + "theoretical_loss": 3.676999977755555, + "tokens_seen": 923867136 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036367101303911735, + "loss": 3.0213, + "theoretical_loss": 3.676974791548756, + "tokens_seen": 923932672 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036366098294884653, + "loss": 2.8875, + "theoretical_loss": 3.6769496076285693, + "tokens_seen": 923998208 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1487983, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1669974327087402, + "objective/train/theoretical_loss": 3.6769370165258404, + "objective/train/tokens_used": 944490976, + "theoretical_loss": 3.6769370165258404, + "tokens_seen": 924030976 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036365095285857576, + "loss": 2.9463, + "theoretical_loss": 3.676924425994626, + "tokens_seen": 924063744 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003636409227683049, + "loss": 2.988, + "theoretical_loss": 3.676899246646556, + "tokens_seen": 924129280 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003636308926780341, + "loss": 2.8919, + "theoretical_loss": 3.67687406958399, + "tokens_seen": 924194816 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036362086258776325, + "loss": 3.0611, + "theoretical_loss": 3.676848894806558, + "tokens_seen": 924260352 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003636108324974925, + "loss": 2.9333, + "theoretical_loss": 3.676823722313891, + "tokens_seen": 924325888 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036360080240722167, + "loss": 2.9274, + "theoretical_loss": 3.6767985521056192, + "tokens_seen": 924391424 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036359077231695085, + "loss": 2.9843, + "theoretical_loss": 3.6767733841813737, + "tokens_seen": 924456960 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036358074222668003, + "loss": 2.8525, + "theoretical_loss": 3.676748218540786, + "tokens_seen": 924522496 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036357071213640927, + "loss": 2.935, + "theoretical_loss": 3.6767230551834857, + "tokens_seen": 924588032 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003635606820461384, + "loss": 2.8093, + "theoretical_loss": 3.676697894109105, + "tokens_seen": 924653568 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036355065195586763, + "loss": 2.8407, + "theoretical_loss": 3.6766727353172737, + "tokens_seen": 924719104 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036354062186559676, + "loss": 2.7735, + "theoretical_loss": 3.6766475788076245, + "tokens_seen": 924784640 + }, + { + "epoch": 2.07, + "learning_rate": 0.000363530591775326, + "loss": 2.7351, + "theoretical_loss": 3.6766224245797874, + "tokens_seen": 924850176 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036352056168505517, + "loss": 2.8052, + "theoretical_loss": 3.6765972726333946, + "tokens_seen": 924915712 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036351053159478435, + "loss": 2.8008, + "theoretical_loss": 3.6765721229680772, + "tokens_seen": 924981248 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036350050150451353, + "loss": 2.888, + "theoretical_loss": 3.6765469755834665, + "tokens_seen": 925046784 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003634904714142427, + "loss": 2.8944, + "theoretical_loss": 3.676521830479195, + "tokens_seen": 925112320 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003634804413239719, + "loss": 2.9959, + "theoretical_loss": 3.6764966876548932, + "tokens_seen": 925177856 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036347041123370113, + "loss": 2.9403, + "theoretical_loss": 3.676471547110194, + "tokens_seen": 925243392 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036346038114343026, + "loss": 3.0334, + "theoretical_loss": 3.676446408844728, + "tokens_seen": 925308928 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003634503510531595, + "loss": 2.7191, + "theoretical_loss": 3.6764212728581285, + "tokens_seen": 925374464 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003634403209628886, + "loss": 3.1562, + "theoretical_loss": 3.676396139150027, + "tokens_seen": 925440000 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036343029087261786, + "loss": 2.9673, + "theoretical_loss": 3.6763710077200553, + "tokens_seen": 925505536 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036342026078234704, + "loss": 2.9718, + "theoretical_loss": 3.6763458785678456, + "tokens_seen": 925571072 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003634102306920762, + "loss": 2.7644, + "theoretical_loss": 3.676320751693031, + "tokens_seen": 925636608 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1490702, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.873175859451294, + "objective/train/theoretical_loss": 3.6763081891095317, + "objective/train/tokens_used": 946129376, + "theoretical_loss": 3.6763081891095317, + "tokens_seen": 925669376 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003634002006018054, + "loss": 2.9076, + "theoretical_loss": 3.676295627095243, + "tokens_seen": 925702144 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036339017051153464, + "loss": 2.9272, + "theoretical_loss": 3.6762705047741147, + "tokens_seen": 925767680 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036338014042126376, + "loss": 2.9403, + "theoretical_loss": 3.6762453847292784, + "tokens_seen": 925833216 + }, + { + "epoch": 2.07, + "learning_rate": 0.000363370110330993, + "loss": 2.9987, + "theoretical_loss": 3.676220266960367, + "tokens_seen": 925898752 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003633600802407222, + "loss": 2.8445, + "theoretical_loss": 3.6761951514670126, + "tokens_seen": 925964288 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036335005015045136, + "loss": 2.9427, + "theoretical_loss": 3.6761700382488485, + "tokens_seen": 926029824 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003633400200601806, + "loss": 2.8592, + "theoretical_loss": 3.6761449273055073, + "tokens_seen": 926095360 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003633299899699097, + "loss": 3.008, + "theoretical_loss": 3.6761198186366224, + "tokens_seen": 926160896 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036331995987963896, + "loss": 2.8733, + "theoretical_loss": 3.6760947122418264, + "tokens_seen": 926226432 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003633099297893681, + "loss": 2.8579, + "theoretical_loss": 3.676069608120753, + "tokens_seen": 926291968 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003632998996990973, + "loss": 2.8369, + "theoretical_loss": 3.6760445062730347, + "tokens_seen": 926357504 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003632898696088265, + "loss": 3.0368, + "theoretical_loss": 3.6760194066983054, + "tokens_seen": 926423040 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003632798395185557, + "loss": 2.8716, + "theoretical_loss": 3.6759943093961986, + "tokens_seen": 926488576 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036326980942828486, + "loss": 2.9947, + "theoretical_loss": 3.675969214366347, + "tokens_seen": 926554112 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003632597793380141, + "loss": 2.9311, + "theoretical_loss": 3.6759441216083855, + "tokens_seen": 926619648 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003632497492477432, + "loss": 2.9707, + "theoretical_loss": 3.6759190311219467, + "tokens_seen": 926685184 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036323971915747246, + "loss": 2.9558, + "theoretical_loss": 3.6758939429066646, + "tokens_seen": 926750720 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003632296890672016, + "loss": 2.8636, + "theoretical_loss": 3.6758688569621727, + "tokens_seen": 926816256 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003632196589769308, + "loss": 2.741, + "theoretical_loss": 3.6758437732881055, + "tokens_seen": 926881792 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036320962888666, + "loss": 2.7041, + "theoretical_loss": 3.675818691884097, + "tokens_seen": 926947328 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003631995987963892, + "loss": 2.8922, + "theoretical_loss": 3.675793612749781, + "tokens_seen": 927012864 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036318956870611837, + "loss": 2.739, + "theoretical_loss": 3.6757685358847914, + "tokens_seen": 927078400 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036317953861584755, + "loss": 2.7865, + "theoretical_loss": 3.6757434612887634, + "tokens_seen": 927143936 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036316950852557673, + "loss": 2.9785, + "theoretical_loss": 3.6757183889613305, + "tokens_seen": 927209472 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036315947843530596, + "loss": 2.9216, + "theoretical_loss": 3.675693318902127, + "tokens_seen": 927275008 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1493732, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.150300979614258, + "objective/train/theoretical_loss": 3.6756807847229975, + "objective/train/tokens_used": 947767776, + "theoretical_loss": 3.6756807847229975, + "tokens_seen": 927307776 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003631494483450351, + "loss": 2.8418, + "theoretical_loss": 3.675668251110788, + "tokens_seen": 927340544 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003631394182547643, + "loss": 2.7748, + "theoretical_loss": 3.6756431855869485, + "tokens_seen": 927406080 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036312938816449345, + "loss": 2.7081, + "theoretical_loss": 3.675618122330242, + "tokens_seen": 927471616 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003631193580742227, + "loss": 2.9793, + "theoretical_loss": 3.6755930613403036, + "tokens_seen": 927537152 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036310932798395187, + "loss": 2.9792, + "theoretical_loss": 3.6755680026167683, + "tokens_seen": 927602688 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036309929789368105, + "loss": 2.7728, + "theoretical_loss": 3.675542946159272, + "tokens_seen": 927668224 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036308926780341023, + "loss": 2.7771, + "theoretical_loss": 3.6755178919674476, + "tokens_seen": 927733760 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036307923771313947, + "loss": 2.9408, + "theoretical_loss": 3.675492840040932, + "tokens_seen": 927799296 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003630692076228686, + "loss": 2.7692, + "theoretical_loss": 3.6754677903793604, + "tokens_seen": 927864832 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036305917753259783, + "loss": 2.8193, + "theoretical_loss": 3.675442742982367, + "tokens_seen": 927930368 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036304914744232696, + "loss": 2.9084, + "theoretical_loss": 3.6754176978495874, + "tokens_seen": 927995904 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003630391173520562, + "loss": 2.962, + "theoretical_loss": 3.6753926549806577, + "tokens_seen": 928061440 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036302908726178537, + "loss": 2.8966, + "theoretical_loss": 3.675367614375213, + "tokens_seen": 928126976 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036301905717151455, + "loss": 2.8822, + "theoretical_loss": 3.6753425760328886, + "tokens_seen": 928192512 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036300902708124374, + "loss": 2.8086, + "theoretical_loss": 3.675317539953321, + "tokens_seen": 928258048 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003629989969909729, + "loss": 2.7896, + "theoretical_loss": 3.675292506136145, + "tokens_seen": 928323584 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003629889669007021, + "loss": 2.7382, + "theoretical_loss": 3.6752674745809975, + "tokens_seen": 928389120 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036297893681043133, + "loss": 3.034, + "theoretical_loss": 3.6752424452875134, + "tokens_seen": 928454656 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036296890672016046, + "loss": 2.8688, + "theoretical_loss": 3.67521741825533, + "tokens_seen": 928520192 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003629588766298897, + "loss": 2.9838, + "theoretical_loss": 3.6751923934840818, + "tokens_seen": 928585728 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003629488465396188, + "loss": 2.9335, + "theoretical_loss": 3.6751673709734067, + "tokens_seen": 928651264 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036293881644934806, + "loss": 2.9566, + "theoretical_loss": 3.6751423507229397, + "tokens_seen": 928716800 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036292878635907724, + "loss": 2.6759, + "theoretical_loss": 3.6751173327323174, + "tokens_seen": 928782336 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003629187562688064, + "loss": 2.9578, + "theoretical_loss": 3.6750923170011767, + "tokens_seen": 928847872 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003629087261785356, + "loss": 2.8522, + "theoretical_loss": 3.675067303529154, + "tokens_seen": 928913408 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1495096, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9690494537353516, + "objective/train/theoretical_loss": 3.6750547976401986, + "objective/train/tokens_used": 949406176, + "theoretical_loss": 3.6750547976401986, + "tokens_seen": 928946176 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036289869608826484, + "loss": 2.8616, + "theoretical_loss": 3.6750422923158856, + "tokens_seen": 928978944 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036288866599799396, + "loss": 2.9689, + "theoretical_loss": 3.675017283361009, + "tokens_seen": 929044480 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003628786359077232, + "loss": 2.8997, + "theoretical_loss": 3.6749922766641596, + "tokens_seen": 929110016 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003628686058174523, + "loss": 2.8885, + "theoretical_loss": 3.6749672722249755, + "tokens_seen": 929175552 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036285857572718156, + "loss": 2.9855, + "theoretical_loss": 3.674942270043094, + "tokens_seen": 929241088 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036284854563691074, + "loss": 2.7534, + "theoretical_loss": 3.6749172701181507, + "tokens_seen": 929306624 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003628385155466399, + "loss": 2.8674, + "theoretical_loss": 3.6748922724497834, + "tokens_seen": 929372160 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003628284854563691, + "loss": 2.9722, + "theoretical_loss": 3.674867277037629, + "tokens_seen": 929437696 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003628184553660983, + "loss": 2.9303, + "theoretical_loss": 3.6748422838813255, + "tokens_seen": 929503232 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036280842527582747, + "loss": 2.742, + "theoretical_loss": 3.6748172929805105, + "tokens_seen": 929568768 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003627983951855567, + "loss": 2.8014, + "theoretical_loss": 3.6747923043348205, + "tokens_seen": 929634304 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036278836509528583, + "loss": 2.8539, + "theoretical_loss": 3.674767317943893, + "tokens_seen": 929699840 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036277833500501506, + "loss": 2.9365, + "theoretical_loss": 3.6747423338073664, + "tokens_seen": 929765376 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036276830491474424, + "loss": 3.0801, + "theoretical_loss": 3.674717351924878, + "tokens_seen": 929830912 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003627582748244734, + "loss": 2.7656, + "theoretical_loss": 3.674692372296066, + "tokens_seen": 929896448 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003627482447342026, + "loss": 2.8095, + "theoretical_loss": 3.6746673949205677, + "tokens_seen": 929961984 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003627382146439318, + "loss": 2.9121, + "theoretical_loss": 3.6746424197980208, + "tokens_seen": 930027520 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036272818455366097, + "loss": 3.0254, + "theoretical_loss": 3.6746174469280644, + "tokens_seen": 930093056 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003627181544633902, + "loss": 2.9212, + "theoretical_loss": 3.6745924763103357, + "tokens_seen": 930158592 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036270812437311933, + "loss": 2.7991, + "theoretical_loss": 3.6745675079444737, + "tokens_seen": 930224128 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036269809428284857, + "loss": 2.7959, + "theoretical_loss": 3.6745425418301165, + "tokens_seen": 930289664 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003626880641925777, + "loss": 2.7001, + "theoretical_loss": 3.6745175779669017, + "tokens_seen": 930355200 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036267803410230693, + "loss": 2.968, + "theoretical_loss": 3.6744926163544687, + "tokens_seen": 930420736 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003626680040120361, + "loss": 2.8351, + "theoretical_loss": 3.6744676569924555, + "tokens_seen": 930486272 + }, + { + "epoch": 2.07, + "learning_rate": 0.0003626579739217653, + "loss": 2.9072, + "theoretical_loss": 3.674442699880501, + "tokens_seen": 930551808 + }, + { + "epoch": 2.07, + "objective/train/docs_used": 1497971, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.057682752609253, + "objective/train/theoretical_loss": 3.6744302221681826, + "objective/train/tokens_used": 951044576, + "theoretical_loss": 3.6744302221681826, + "tokens_seen": 930584576 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036264794383149447, + "loss": 2.7908, + "theoretical_loss": 3.674417745018244, + "tokens_seen": 930617344 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036263791374122365, + "loss": 2.8916, + "theoretical_loss": 3.674392792405323, + "tokens_seen": 930682880 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036262788365095283, + "loss": 2.799, + "theoretical_loss": 3.6743678420413772, + "tokens_seen": 930748416 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036261785356068207, + "loss": 3.007, + "theoretical_loss": 3.674342893926045, + "tokens_seen": 930813952 + }, + { + "epoch": 2.07, + "learning_rate": 0.00036260782347041125, + "loss": 2.9529, + "theoretical_loss": 3.674317948058966, + "tokens_seen": 930879488 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036259779338014043, + "loss": 2.8339, + "theoretical_loss": 3.6742930044397797, + "tokens_seen": 930945024 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036258776328986967, + "loss": 2.8906, + "theoretical_loss": 3.6742680630681237, + "tokens_seen": 931010560 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003625777331995988, + "loss": 2.975, + "theoretical_loss": 3.6742431239436395, + "tokens_seen": 931076096 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036256770310932803, + "loss": 3.0627, + "theoretical_loss": 3.6742181870659647, + "tokens_seen": 931141632 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036255767301905716, + "loss": 2.6937, + "theoretical_loss": 3.6741932524347396, + "tokens_seen": 931207168 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003625476429287864, + "loss": 2.8874, + "theoretical_loss": 3.6741683200496036, + "tokens_seen": 931272704 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003625376128385156, + "loss": 2.7874, + "theoretical_loss": 3.6741433899101965, + "tokens_seen": 931338240 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036252758274824475, + "loss": 2.9792, + "theoretical_loss": 3.6741184620161578, + "tokens_seen": 931403776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036251755265797394, + "loss": 3.1259, + "theoretical_loss": 3.674093536367127, + "tokens_seen": 931469312 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003625075225677031, + "loss": 2.8565, + "theoretical_loss": 3.6740686129627447, + "tokens_seen": 931534848 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003624974924774323, + "loss": 2.9293, + "theoretical_loss": 3.674043691802651, + "tokens_seen": 931600384 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036248746238716153, + "loss": 2.9639, + "theoretical_loss": 3.6740187728864844, + "tokens_seen": 931665920 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036247743229689066, + "loss": 2.9456, + "theoretical_loss": 3.6739938562138867, + "tokens_seen": 931731456 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003624674022066199, + "loss": 2.8468, + "theoretical_loss": 3.673968941784497, + "tokens_seen": 931796992 + }, + { + "epoch": 2.08, + "learning_rate": 0.000362457372116349, + "loss": 2.9117, + "theoretical_loss": 3.6739440295979566, + "tokens_seen": 931862528 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036244734202607826, + "loss": 2.8637, + "theoretical_loss": 3.673919119653905, + "tokens_seen": 931928064 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036243731193580744, + "loss": 2.8739, + "theoretical_loss": 3.673894211951983, + "tokens_seen": 931993600 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003624272818455366, + "loss": 2.9402, + "theoretical_loss": 3.6738693064918313, + "tokens_seen": 932059136 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003624172517552658, + "loss": 3.0169, + "theoretical_loss": 3.6738444032730904, + "tokens_seen": 932124672 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036240722166499504, + "loss": 2.8236, + "theoretical_loss": 3.6738195022954008, + "tokens_seen": 932190208 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1500859, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0026113986968994, + "objective/train/theoretical_loss": 3.6738070526468385, + "objective/train/tokens_used": 952682976, + "theoretical_loss": 3.6738070526468385, + "tokens_seen": 932222976 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036239719157472416, + "loss": 2.9042, + "theoretical_loss": 3.673794603558404, + "tokens_seen": 932255744 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003623871614844534, + "loss": 2.8567, + "theoretical_loss": 3.67376970706174, + "tokens_seen": 932321280 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003623771313941825, + "loss": 2.9608, + "theoretical_loss": 3.67374481280505, + "tokens_seen": 932386816 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036236710130391176, + "loss": 2.7962, + "theoretical_loss": 3.673719920787976, + "tokens_seen": 932452352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036235707121364094, + "loss": 2.9941, + "theoretical_loss": 3.6736950310101575, + "tokens_seen": 932517888 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003623470411233701, + "loss": 2.9175, + "theoretical_loss": 3.6736701434712367, + "tokens_seen": 932583424 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003623370110330993, + "loss": 2.713, + "theoretical_loss": 3.673645258170855, + "tokens_seen": 932648960 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003623269809428285, + "loss": 2.7606, + "theoretical_loss": 3.673620375108653, + "tokens_seen": 932714496 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036231695085255767, + "loss": 2.8791, + "theoretical_loss": 3.673595494284273, + "tokens_seen": 932780032 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003623069207622869, + "loss": 2.9336, + "theoretical_loss": 3.673570615697356, + "tokens_seen": 932845568 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036229689067201603, + "loss": 2.8461, + "theoretical_loss": 3.673545739347544, + "tokens_seen": 932911104 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036228686058174526, + "loss": 2.9865, + "theoretical_loss": 3.6735208652344786, + "tokens_seen": 932976640 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036227683049147444, + "loss": 2.8491, + "theoretical_loss": 3.6734959933578013, + "tokens_seen": 933042176 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003622668004012036, + "loss": 2.9981, + "theoretical_loss": 3.673471123717154, + "tokens_seen": 933107712 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003622567703109328, + "loss": 2.9626, + "theoretical_loss": 3.673446256312179, + "tokens_seen": 933173248 + }, + { + "epoch": 2.08, + "learning_rate": 0.000362246740220662, + "loss": 2.9147, + "theoretical_loss": 3.6734213911425178, + "tokens_seen": 933238784 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036223671013039117, + "loss": 2.962, + "theoretical_loss": 3.673396528207814, + "tokens_seen": 933304320 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003622266800401204, + "loss": 2.8503, + "theoretical_loss": 3.6733716675077073, + "tokens_seen": 933369856 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036221664994984953, + "loss": 2.8149, + "theoretical_loss": 3.673346809041842, + "tokens_seen": 933435392 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036220661985957877, + "loss": 2.9889, + "theoretical_loss": 3.67332195280986, + "tokens_seen": 933500928 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003621965897693079, + "loss": 2.9139, + "theoretical_loss": 3.6732970988114033, + "tokens_seen": 933566464 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036218655967903713, + "loss": 2.9587, + "theoretical_loss": 3.673272247046114, + "tokens_seen": 933632000 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003621765295887663, + "loss": 2.9045, + "theoretical_loss": 3.6732473975136366, + "tokens_seen": 933697536 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003621664994984955, + "loss": 2.876, + "theoretical_loss": 3.673222550213612, + "tokens_seen": 933763072 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036215646940822467, + "loss": 2.9544, + "theoretical_loss": 3.6731977051456837, + "tokens_seen": 933828608 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1503031, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0684561729431152, + "objective/train/theoretical_loss": 3.673185283448644, + "objective/train/tokens_used": 954321376, + "theoretical_loss": 3.673185283448644, + "tokens_seen": 933861376 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036214643931795385, + "loss": 2.9818, + "theoretical_loss": 3.673172862309494, + "tokens_seen": 933894144 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036213640922768303, + "loss": 2.8807, + "theoretical_loss": 3.6731480217046872, + "tokens_seen": 933959680 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036212637913741227, + "loss": 2.7561, + "theoretical_loss": 3.6731231833309046, + "tokens_seen": 934025216 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003621163490471414, + "loss": 2.9385, + "theoretical_loss": 3.6730983471877905, + "tokens_seen": 934090752 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036210631895687063, + "loss": 3.0013, + "theoretical_loss": 3.6730735132749874, + "tokens_seen": 934156288 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003620962888665998, + "loss": 2.8478, + "theoretical_loss": 3.6730486815921393, + "tokens_seen": 934221824 + }, + { + "epoch": 2.08, + "learning_rate": 0.000362086258776329, + "loss": 2.7991, + "theoretical_loss": 3.673023852138889, + "tokens_seen": 934287360 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003620762286860582, + "loss": 2.7946, + "theoretical_loss": 3.67299902491488, + "tokens_seen": 934352896 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036206619859578736, + "loss": 2.9228, + "theoretical_loss": 3.6729741999197554, + "tokens_seen": 934418432 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036205616850551654, + "loss": 2.9307, + "theoretical_loss": 3.6729493771531594, + "tokens_seen": 934483968 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003620461384152458, + "loss": 2.9734, + "theoretical_loss": 3.6729245566147357, + "tokens_seen": 934549504 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003620361083249749, + "loss": 2.8652, + "theoretical_loss": 3.6728997383041277, + "tokens_seen": 934615040 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036202607823470414, + "loss": 2.7161, + "theoretical_loss": 3.6728749222209798, + "tokens_seen": 934680576 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036201604814443326, + "loss": 2.6862, + "theoretical_loss": 3.672850108364935, + "tokens_seen": 934746112 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003620060180541625, + "loss": 2.9155, + "theoretical_loss": 3.6728252967356383, + "tokens_seen": 934811648 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003619959879638917, + "loss": 2.8927, + "theoretical_loss": 3.6728004873327333, + "tokens_seen": 934877184 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036198595787362086, + "loss": 3.0202, + "theoretical_loss": 3.672775680155864, + "tokens_seen": 934942720 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036197592778335004, + "loss": 2.7353, + "theoretical_loss": 3.672750875204675, + "tokens_seen": 935008256 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003619658976930792, + "loss": 2.8378, + "theoretical_loss": 3.6727260724788104, + "tokens_seen": 935073792 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003619558676028084, + "loss": 3.0032, + "theoretical_loss": 3.672701271977915, + "tokens_seen": 935139328 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036194583751253764, + "loss": 3.0066, + "theoretical_loss": 3.6726764737016326, + "tokens_seen": 935204864 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036193580742226677, + "loss": 2.8413, + "theoretical_loss": 3.6726516776496085, + "tokens_seen": 935270400 + }, + { + "epoch": 2.08, + "learning_rate": 0.000361925777331996, + "loss": 2.7503, + "theoretical_loss": 3.6726268838214873, + "tokens_seen": 935335936 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003619157472417252, + "loss": 2.9491, + "theoretical_loss": 3.6726020922169136, + "tokens_seen": 935401472 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036190571715145436, + "loss": 2.8264, + "theoretical_loss": 3.6725773028355313, + "tokens_seen": 935467008 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1505779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8629660606384277, + "objective/train/theoretical_loss": 3.6725649089784262, + "objective/train/tokens_used": 955959776, + "theoretical_loss": 3.6725649089784262, + "tokens_seen": 935499776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036189568706118354, + "loss": 2.9628, + "theoretical_loss": 3.672552515676987, + "tokens_seen": 935532544 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003618856569709127, + "loss": 2.9246, + "theoretical_loss": 3.6725277307409243, + "tokens_seen": 935598080 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003618756268806419, + "loss": 3.0244, + "theoretical_loss": 3.6725029480269886, + "tokens_seen": 935663616 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036186559679037114, + "loss": 3.0001, + "theoretical_loss": 3.6724781675348255, + "tokens_seen": 935729152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003618555667001003, + "loss": 2.7454, + "theoretical_loss": 3.67245338926408, + "tokens_seen": 935794688 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003618455366098295, + "loss": 2.9849, + "theoretical_loss": 3.6724286132143975, + "tokens_seen": 935860224 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003618355065195587, + "loss": 2.7745, + "theoretical_loss": 3.672403839385423, + "tokens_seen": 935925760 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036182547642928787, + "loss": 3.0514, + "theoretical_loss": 3.6723790677768022, + "tokens_seen": 935991296 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003618154463390171, + "loss": 2.8933, + "theoretical_loss": 3.672354298388181, + "tokens_seen": 936056832 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036180541624874623, + "loss": 2.8938, + "theoretical_loss": 3.672329531219205, + "tokens_seen": 936122368 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036179538615847546, + "loss": 2.8508, + "theoretical_loss": 3.672304766269519, + "tokens_seen": 936187904 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036178535606820464, + "loss": 2.8043, + "theoretical_loss": 3.67228000353877, + "tokens_seen": 936253440 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003617753259779338, + "loss": 2.9193, + "theoretical_loss": 3.672255243026603, + "tokens_seen": 936318976 + }, + { + "epoch": 2.08, + "learning_rate": 0.000361765295887663, + "loss": 2.9592, + "theoretical_loss": 3.6722304847326646, + "tokens_seen": 936384512 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003617552657973922, + "loss": 2.9344, + "theoretical_loss": 3.6722057286566003, + "tokens_seen": 936450048 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036174523570712137, + "loss": 2.9231, + "theoretical_loss": 3.6721809747980565, + "tokens_seen": 936515584 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003617352056168506, + "loss": 3.01, + "theoretical_loss": 3.6721562231566796, + "tokens_seen": 936581120 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036172517552657973, + "loss": 3.0431, + "theoretical_loss": 3.6721314737321156, + "tokens_seen": 936646656 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036171514543630897, + "loss": 2.7721, + "theoretical_loss": 3.672106726524011, + "tokens_seen": 936712192 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003617051153460381, + "loss": 2.8001, + "theoretical_loss": 3.672081981532012, + "tokens_seen": 936777728 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036169508525576733, + "loss": 2.5755, + "theoretical_loss": 3.6720572387557655, + "tokens_seen": 936843264 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003616850551654965, + "loss": 2.999, + "theoretical_loss": 3.672032498194918, + "tokens_seen": 936908800 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003616750250752257, + "loss": 2.8727, + "theoretical_loss": 3.672007759849116, + "tokens_seen": 936974336 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036166499498495487, + "loss": 2.8504, + "theoretical_loss": 3.6719830237180067, + "tokens_seen": 937039872 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036165496489468405, + "loss": 2.8309, + "theoretical_loss": 3.6719582898012364, + "tokens_seen": 937105408 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1508733, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9352166652679443, + "objective/train/theoretical_loss": 3.6719459236731185, + "objective/train/tokens_used": 957598176, + "theoretical_loss": 3.6719459236731185, + "tokens_seen": 937138176 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036164493480441323, + "loss": 2.7691, + "theoretical_loss": 3.6719335580984525, + "tokens_seen": 937170944 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036163490471414247, + "loss": 2.8369, + "theoretical_loss": 3.671908828609302, + "tokens_seen": 937236480 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003616248746238716, + "loss": 2.8156, + "theoretical_loss": 3.6718841013334313, + "tokens_seen": 937302016 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036161484453360083, + "loss": 3.0358, + "theoretical_loss": 3.6718593762704885, + "tokens_seen": 937367552 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036160481444333, + "loss": 2.868, + "theoretical_loss": 3.6718346534201203, + "tokens_seen": 937433088 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003615947843530592, + "loss": 2.8578, + "theoretical_loss": 3.6718099327819744, + "tokens_seen": 937498624 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003615847542627884, + "loss": 2.9357, + "theoretical_loss": 3.671785214355698, + "tokens_seen": 937564160 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036157472417251756, + "loss": 3.0752, + "theoretical_loss": 3.671760498140939, + "tokens_seen": 937629696 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036156469408224674, + "loss": 2.8638, + "theoretical_loss": 3.671735784137344, + "tokens_seen": 937695232 + }, + { + "epoch": 2.08, + "learning_rate": 0.000361554663991976, + "loss": 2.8389, + "theoretical_loss": 3.6717110723445616, + "tokens_seen": 937760768 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003615446339017051, + "loss": 2.8237, + "theoretical_loss": 3.671686362762239, + "tokens_seen": 937826304 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036153460381143434, + "loss": 2.9305, + "theoretical_loss": 3.6716616553900248, + "tokens_seen": 937891840 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036152457372116346, + "loss": 2.9394, + "theoretical_loss": 3.6716369502275663, + "tokens_seen": 937957376 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003615145436308927, + "loss": 2.8707, + "theoretical_loss": 3.671612247274511, + "tokens_seen": 938022912 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003615045135406219, + "loss": 2.9027, + "theoretical_loss": 3.6715875465305077, + "tokens_seen": 938088448 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036149448345035106, + "loss": 2.8665, + "theoretical_loss": 3.671562847995205, + "tokens_seen": 938153984 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036148445336008024, + "loss": 2.948, + "theoretical_loss": 3.67153815166825, + "tokens_seen": 938219520 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003614744232698094, + "loss": 3.0092, + "theoretical_loss": 3.671513457549292, + "tokens_seen": 938285056 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003614643931795386, + "loss": 2.8465, + "theoretical_loss": 3.6714887656379784, + "tokens_seen": 938350592 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036145436308926784, + "loss": 2.8117, + "theoretical_loss": 3.671464075933958, + "tokens_seen": 938416128 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036144433299899697, + "loss": 2.968, + "theoretical_loss": 3.6714393884368803, + "tokens_seen": 938481664 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003614343029087262, + "loss": 2.8889, + "theoretical_loss": 3.6714147031463926, + "tokens_seen": 938547200 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003614242728184554, + "loss": 2.9742, + "theoretical_loss": 3.6713900200621445, + "tokens_seen": 938612736 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036141424272818456, + "loss": 2.7703, + "theoretical_loss": 3.6713653391837844, + "tokens_seen": 938678272 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036140421263791374, + "loss": 2.8244, + "theoretical_loss": 3.6713406605109613, + "tokens_seen": 938743808 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1511509, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8299081325531006, + "objective/train/theoretical_loss": 3.671328322001517, + "objective/train/tokens_used": 959236576, + "theoretical_loss": 3.671328322001517, + "tokens_seen": 938776576 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003613941825476429, + "loss": 2.8549, + "theoretical_loss": 3.671315984043324, + "tokens_seen": 938809344 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003613841524573721, + "loss": 2.8624, + "theoretical_loss": 3.671291309780522, + "tokens_seen": 938874880 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036137412236710134, + "loss": 2.8958, + "theoretical_loss": 3.671266637722204, + "tokens_seen": 938940416 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036136409227683047, + "loss": 2.81, + "theoretical_loss": 3.671241967868019, + "tokens_seen": 939005952 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003613540621865597, + "loss": 3.0467, + "theoretical_loss": 3.6712173002176165, + "tokens_seen": 939071488 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036134403209628883, + "loss": 3.0029, + "theoretical_loss": 3.6711926347706463, + "tokens_seen": 939137024 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036133400200601807, + "loss": 2.9054, + "theoretical_loss": 3.6711679715267573, + "tokens_seen": 939202560 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036132397191574725, + "loss": 2.8446, + "theoretical_loss": 3.671143310485599, + "tokens_seen": 939268096 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036131394182547643, + "loss": 2.9602, + "theoretical_loss": 3.6711186516468213, + "tokens_seen": 939333632 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003613039117352056, + "loss": 3.126, + "theoretical_loss": 3.6710939950100734, + "tokens_seen": 939399168 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036129388164493484, + "loss": 3.0691, + "theoretical_loss": 3.6710693405750057, + "tokens_seen": 939464704 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036128385155466397, + "loss": 3.0304, + "theoretical_loss": 3.6710446883412677, + "tokens_seen": 939530240 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003612738214643932, + "loss": 3.0335, + "theoretical_loss": 3.6710200383085096, + "tokens_seen": 939595776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036126379137412233, + "loss": 2.9216, + "theoretical_loss": 3.6709953904763806, + "tokens_seen": 939661312 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036125376128385157, + "loss": 2.9095, + "theoretical_loss": 3.670970744844532, + "tokens_seen": 939726848 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036124373119358075, + "loss": 2.7017, + "theoretical_loss": 3.670946101412613, + "tokens_seen": 939792384 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036123370110330993, + "loss": 2.8576, + "theoretical_loss": 3.6709214601802738, + "tokens_seen": 939857920 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003612236710130391, + "loss": 2.943, + "theoretical_loss": 3.6708968211471653, + "tokens_seen": 939923456 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003612136409227683, + "loss": 2.93, + "theoretical_loss": 3.6708721843129375, + "tokens_seen": 939988992 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003612036108324975, + "loss": 2.9799, + "theoretical_loss": 3.670847549677241, + "tokens_seen": 940054528 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003611935807422267, + "loss": 2.8914, + "theoretical_loss": 3.6708229172397266, + "tokens_seen": 940120064 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036118355065195584, + "loss": 2.8613, + "theoretical_loss": 3.6707982870000446, + "tokens_seen": 940185600 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036117352056168507, + "loss": 2.954, + "theoretical_loss": 3.670773658957846, + "tokens_seen": 940251136 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003611634904714142, + "loss": 2.8908, + "theoretical_loss": 3.6707490331127812, + "tokens_seen": 940316672 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036115346038114343, + "loss": 2.9294, + "theoretical_loss": 3.6707244094645013, + "tokens_seen": 940382208 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1514286, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9104864597320557, + "objective/train/theoretical_loss": 3.6707120984640467, + "objective/train/tokens_used": 960874976, + "theoretical_loss": 3.6707120984640467, + "tokens_seen": 940414976 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003611434302908726, + "loss": 3.0568, + "theoretical_loss": 3.6706997880126573, + "tokens_seen": 940447744 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003611334002006018, + "loss": 2.8119, + "theoretical_loss": 3.6706751687569, + "tokens_seen": 940513280 + }, + { + "epoch": 2.08, + "learning_rate": 0.000361123370110331, + "loss": 2.6574, + "theoretical_loss": 3.6706505516968813, + "tokens_seen": 940578816 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003611133400200602, + "loss": 2.8607, + "theoretical_loss": 3.670625936832251, + "tokens_seen": 940644352 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003611033099297894, + "loss": 2.9343, + "theoretical_loss": 3.6706013241626616, + "tokens_seen": 940709888 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003610932798395186, + "loss": 2.7802, + "theoretical_loss": 3.6705767136877645, + "tokens_seen": 940775424 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036108324974924776, + "loss": 2.9903, + "theoretical_loss": 3.67055210540721, + "tokens_seen": 940840960 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036107321965897694, + "loss": 2.8261, + "theoretical_loss": 3.6705274993206505, + "tokens_seen": 940906496 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003610631895687062, + "loss": 2.8553, + "theoretical_loss": 3.670502895427738, + "tokens_seen": 940972032 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003610531594784353, + "loss": 2.8835, + "theoretical_loss": 3.670478293728123, + "tokens_seen": 941037568 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036104312938816454, + "loss": 2.8881, + "theoretical_loss": 3.6704536942214574, + "tokens_seen": 941103104 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036103309929789366, + "loss": 2.8972, + "theoretical_loss": 3.670429096907394, + "tokens_seen": 941168640 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003610230692076229, + "loss": 2.8541, + "theoretical_loss": 3.670404501785584, + "tokens_seen": 941234176 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003610130391173521, + "loss": 2.9936, + "theoretical_loss": 3.6703799088556797, + "tokens_seen": 941299712 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036100300902708126, + "loss": 2.9166, + "theoretical_loss": 3.670355318117333, + "tokens_seen": 941365248 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036099297893681044, + "loss": 2.8758, + "theoretical_loss": 3.670330729570196, + "tokens_seen": 941430784 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003609829488465396, + "loss": 2.9184, + "theoretical_loss": 3.670306143213921, + "tokens_seen": 941496320 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003609729187562688, + "loss": 2.7934, + "theoretical_loss": 3.6702815590481603, + "tokens_seen": 941561856 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036096288866599804, + "loss": 2.8836, + "theoretical_loss": 3.6702569770725666, + "tokens_seen": 941627392 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036095285857572717, + "loss": 3.0061, + "theoretical_loss": 3.670232397286792, + "tokens_seen": 941692928 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003609428284854564, + "loss": 2.9123, + "theoretical_loss": 3.6702078196904884, + "tokens_seen": 941758464 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003609327983951856, + "loss": 2.9099, + "theoretical_loss": 3.67018324428331, + "tokens_seen": 941824000 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036092276830491476, + "loss": 2.8481, + "theoretical_loss": 3.6701586710649083, + "tokens_seen": 941889536 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036091273821464394, + "loss": 2.8355, + "theoretical_loss": 3.670134100034937, + "tokens_seen": 941955072 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003609027081243731, + "loss": 2.8666, + "theoretical_loss": 3.670109531193048, + "tokens_seen": 942020608 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1516952, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.901960849761963, + "objective/train/theoretical_loss": 3.6700972475925258, + "objective/train/tokens_used": 962513376, + "theoretical_loss": 3.6700972475925258, + "tokens_seen": 942053376 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003608926780341023, + "loss": 2.8942, + "theoretical_loss": 3.670084964538894, + "tokens_seen": 942086144 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036088264794383154, + "loss": 2.9524, + "theoretical_loss": 3.6700604000721295, + "tokens_seen": 942151680 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036087261785356067, + "loss": 3.0878, + "theoretical_loss": 3.6700358377924065, + "tokens_seen": 942217216 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003608625877632899, + "loss": 2.8706, + "theoretical_loss": 3.6700112776993787, + "tokens_seen": 942282752 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036085255767301903, + "loss": 2.8126, + "theoretical_loss": 3.669986719792699, + "tokens_seen": 942348288 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036084252758274827, + "loss": 3.088, + "theoretical_loss": 3.669962164072021, + "tokens_seen": 942413824 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036083249749247745, + "loss": 2.8398, + "theoretical_loss": 3.669937610536998, + "tokens_seen": 942479360 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036082246740220663, + "loss": 3.0346, + "theoretical_loss": 3.6699130591872833, + "tokens_seen": 942544896 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003608124373119358, + "loss": 2.8228, + "theoretical_loss": 3.6698885100225302, + "tokens_seen": 942610432 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036080240722166505, + "loss": 2.9328, + "theoretical_loss": 3.6698639630423937, + "tokens_seen": 942675968 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036079237713139417, + "loss": 2.8696, + "theoretical_loss": 3.669839418246527, + "tokens_seen": 942741504 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003607823470411234, + "loss": 2.9298, + "theoretical_loss": 3.6698148756345823, + "tokens_seen": 942807040 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036077231695085253, + "loss": 3.014, + "theoretical_loss": 3.6697903352062156, + "tokens_seen": 942872576 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036076228686058177, + "loss": 3.0465, + "theoretical_loss": 3.6697657969610793, + "tokens_seen": 942938112 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036075225677031095, + "loss": 2.7863, + "theoretical_loss": 3.669741260898829, + "tokens_seen": 943003648 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036074222668004013, + "loss": 2.9502, + "theoretical_loss": 3.6697167270191176, + "tokens_seen": 943069184 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003607321965897693, + "loss": 2.8131, + "theoretical_loss": 3.6696921953215997, + "tokens_seen": 943134720 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003607221664994985, + "loss": 2.8452, + "theoretical_loss": 3.66966766580593, + "tokens_seen": 943200256 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003607121364092277, + "loss": 2.8535, + "theoretical_loss": 3.669643138471762, + "tokens_seen": 943265792 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003607021063189569, + "loss": 2.8909, + "theoretical_loss": 3.669618613318751, + "tokens_seen": 943331328 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036069207622868604, + "loss": 2.9072, + "theoretical_loss": 3.6695940903465507, + "tokens_seen": 943396864 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036068204613841527, + "loss": 2.8722, + "theoretical_loss": 3.6695695695548167, + "tokens_seen": 943462400 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003606720160481444, + "loss": 2.9505, + "theoretical_loss": 3.669545050943203, + "tokens_seen": 943527936 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036066198595787364, + "loss": 2.9337, + "theoretical_loss": 3.6695205345113644, + "tokens_seen": 943593472 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003606519558676028, + "loss": 3.0416, + "theoretical_loss": 3.6694960202589555, + "tokens_seen": 943659008 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1519824, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0098798274993896, + "objective/train/theoretical_loss": 3.6694837639499296, + "objective/train/tokens_used": 964151776, + "theoretical_loss": 3.6694837639499296, + "tokens_seen": 943691776 + }, + { + "epoch": 2.08, + "learning_rate": 0.000360641925777332, + "loss": 2.9854, + "theoretical_loss": 3.669471508185632, + "tokens_seen": 943724544 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003606318956870612, + "loss": 2.738, + "theoretical_loss": 3.6694469982910483, + "tokens_seen": 943790080 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003606218655967904, + "loss": 2.8564, + "theoretical_loss": 3.66942249057486, + "tokens_seen": 943855616 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036061183550651954, + "loss": 3.0125, + "theoretical_loss": 3.669397985036721, + "tokens_seen": 943921152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003606018054162488, + "loss": 2.9212, + "theoretical_loss": 3.669373481676288, + "tokens_seen": 943986688 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003605917753259779, + "loss": 2.9379, + "theoretical_loss": 3.669348980493216, + "tokens_seen": 944052224 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036058174523570714, + "loss": 2.7708, + "theoretical_loss": 3.669324481487159, + "tokens_seen": 944117760 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003605717151454363, + "loss": 2.944, + "theoretical_loss": 3.6692999846577745, + "tokens_seen": 944183296 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003605616850551655, + "loss": 2.7101, + "theoretical_loss": 3.669275490004717, + "tokens_seen": 944248832 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003605516549648947, + "loss": 2.9784, + "theoretical_loss": 3.669250997527642, + "tokens_seen": 944314368 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036054162487462386, + "loss": 2.9071, + "theoretical_loss": 3.669226507226206, + "tokens_seen": 944379904 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036053159478435304, + "loss": 2.9072, + "theoretical_loss": 3.669202019100063, + "tokens_seen": 944445440 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003605215646940823, + "loss": 2.9703, + "theoretical_loss": 3.669177533148871, + "tokens_seen": 944510976 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003605115346038114, + "loss": 2.8699, + "theoretical_loss": 3.6691530493722855, + "tokens_seen": 944576512 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036050150451354064, + "loss": 2.9456, + "theoretical_loss": 3.6691285677699614, + "tokens_seen": 944642048 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036049147442326977, + "loss": 2.9198, + "theoretical_loss": 3.6691040883415553, + "tokens_seen": 944707584 + }, + { + "epoch": 2.08, + "learning_rate": 0.000360481444332999, + "loss": 2.8634, + "theoretical_loss": 3.6690796110867234, + "tokens_seen": 944773120 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003604714142427282, + "loss": 2.8589, + "theoretical_loss": 3.669055136005123, + "tokens_seen": 944838656 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036046138415245737, + "loss": 2.8586, + "theoretical_loss": 3.6690306630964082, + "tokens_seen": 944904192 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036045135406218655, + "loss": 2.9397, + "theoretical_loss": 3.6690061923602375, + "tokens_seen": 944969728 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003604413239719158, + "loss": 2.8523, + "theoretical_loss": 3.668981723796266, + "tokens_seen": 945035264 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003604312938816449, + "loss": 2.888, + "theoretical_loss": 3.668957257404151, + "tokens_seen": 945100800 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036042126379137414, + "loss": 2.9239, + "theoretical_loss": 3.66893279318355, + "tokens_seen": 945166336 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036041123370110327, + "loss": 2.993, + "theoretical_loss": 3.6689083311341175, + "tokens_seen": 945231872 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003604012036108325, + "loss": 2.8188, + "theoretical_loss": 3.668883871255512, + "tokens_seen": 945297408 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1521316, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.009571075439453, + "objective/train/theoretical_loss": 3.6688716421301617, + "objective/train/tokens_used": 965790176, + "theoretical_loss": 3.6688716421301617, + "tokens_seen": 945330176 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003603911735205617, + "loss": 2.8307, + "theoretical_loss": 3.6688594135473895, + "tokens_seen": 945362944 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036038114343029087, + "loss": 2.9999, + "theoretical_loss": 3.6688349580094073, + "tokens_seen": 945428480 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036037111334002005, + "loss": 2.8464, + "theoretical_loss": 3.6688105046412227, + "tokens_seen": 945494016 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036036108324974923, + "loss": 3.0407, + "theoretical_loss": 3.668786053442493, + "tokens_seen": 945559552 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036035105315947847, + "loss": 2.9085, + "theoretical_loss": 3.6687616044128744, + "tokens_seen": 945625088 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036034102306920765, + "loss": 2.856, + "theoretical_loss": 3.6687371575520245, + "tokens_seen": 945690624 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036033099297893683, + "loss": 2.7814, + "theoretical_loss": 3.668712712859601, + "tokens_seen": 945756160 + }, + { + "epoch": 2.08, + "learning_rate": 0.000360320962888666, + "loss": 2.9499, + "theoretical_loss": 3.6686882703352612, + "tokens_seen": 945821696 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036031093279839525, + "loss": 2.8534, + "theoretical_loss": 3.668663829978663, + "tokens_seen": 945887232 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036030090270812437, + "loss": 2.947, + "theoretical_loss": 3.668639391789463, + "tokens_seen": 945952768 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003602908726178536, + "loss": 2.805, + "theoretical_loss": 3.66861495576732, + "tokens_seen": 946018304 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036028084252758273, + "loss": 3.0405, + "theoretical_loss": 3.6685905219118906, + "tokens_seen": 946083840 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036027081243731197, + "loss": 2.9719, + "theoretical_loss": 3.6685660902228334, + "tokens_seen": 946149376 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036026078234704115, + "loss": 3.0295, + "theoretical_loss": 3.668541660699806, + "tokens_seen": 946214912 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036025075225677033, + "loss": 2.8233, + "theoretical_loss": 3.668517233342466, + "tokens_seen": 946280448 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003602407221664995, + "loss": 2.845, + "theoretical_loss": 3.6684928081504724, + "tokens_seen": 946345984 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003602306920762287, + "loss": 2.836, + "theoretical_loss": 3.6684683851234823, + "tokens_seen": 946411520 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003602206619859579, + "loss": 2.9118, + "theoretical_loss": 3.668443964261155, + "tokens_seen": 946477056 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003602106318956871, + "loss": 2.9632, + "theoretical_loss": 3.6684195455631476, + "tokens_seen": 946542592 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036020060180541624, + "loss": 2.9406, + "theoretical_loss": 3.6683951290291192, + "tokens_seen": 946608128 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003601905717151455, + "loss": 2.9088, + "theoretical_loss": 3.6683707146587277, + "tokens_seen": 946673664 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003601805416248746, + "loss": 2.9394, + "theoretical_loss": 3.6683463024516323, + "tokens_seen": 946739200 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036017051153460384, + "loss": 2.8184, + "theoretical_loss": 3.6683218924074907, + "tokens_seen": 946804736 + }, + { + "epoch": 2.08, + "learning_rate": 0.000360160481444333, + "loss": 2.8182, + "theoretical_loss": 3.668297484525963, + "tokens_seen": 946870272 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003601504513540622, + "loss": 3.0151, + "theoretical_loss": 3.668273078806706, + "tokens_seen": 946935808 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1524063, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9343068599700928, + "objective/train/theoretical_loss": 3.6682608767578233, + "objective/train/tokens_used": 967428576, + "theoretical_loss": 3.6682608767578233, + "tokens_seen": 946968576 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003601404212637914, + "loss": 2.7943, + "theoretical_loss": 3.66824867524938, + "tokens_seen": 947001344 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003601303911735206, + "loss": 3.0135, + "theoretical_loss": 3.668224273853644, + "tokens_seen": 947066880 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036012036108324974, + "loss": 2.9198, + "theoretical_loss": 3.6681998746191553, + "tokens_seen": 947132416 + }, + { + "epoch": 2.08, + "learning_rate": 0.000360110330992979, + "loss": 2.956, + "theoretical_loss": 3.668175477545575, + "tokens_seen": 947197952 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003601003009027081, + "loss": 2.7839, + "theoretical_loss": 3.668151082632561, + "tokens_seen": 947263488 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036009027081243734, + "loss": 2.9279, + "theoretical_loss": 3.668126689879773, + "tokens_seen": 947329024 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003600802407221665, + "loss": 2.9224, + "theoretical_loss": 3.6681022992868693, + "tokens_seen": 947394560 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003600702106318957, + "loss": 2.8601, + "theoretical_loss": 3.6680779108535106, + "tokens_seen": 947460096 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003600601805416249, + "loss": 2.8941, + "theoretical_loss": 3.6680535245793555, + "tokens_seen": 947525632 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036005015045135406, + "loss": 2.9618, + "theoretical_loss": 3.668029140464064, + "tokens_seen": 947591168 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036004012036108324, + "loss": 3.095, + "theoretical_loss": 3.6680047585072955, + "tokens_seen": 947656704 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003600300902708125, + "loss": 2.8911, + "theoretical_loss": 3.6679803787087097, + "tokens_seen": 947722240 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003600200601805416, + "loss": 2.9849, + "theoretical_loss": 3.6679560010679664, + "tokens_seen": 947787776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00036001003009027084, + "loss": 2.9079, + "theoretical_loss": 3.6679316255847247, + "tokens_seen": 947853312 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035999999999999997, + "loss": 2.9141, + "theoretical_loss": 3.6679072522586456, + "tokens_seen": 947918848 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003599899699097292, + "loss": 2.8815, + "theoretical_loss": 3.6678828810893886, + "tokens_seen": 947984384 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003599799398194584, + "loss": 2.973, + "theoretical_loss": 3.667858512076614, + "tokens_seen": 948049920 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035996990972918757, + "loss": 2.8918, + "theoretical_loss": 3.667834145219981, + "tokens_seen": 948115456 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035995987963891675, + "loss": 2.8518, + "theoretical_loss": 3.667809780519151, + "tokens_seen": 948180992 + }, + { + "epoch": 2.08, + "learning_rate": 0.000359949849548646, + "loss": 3.0465, + "theoretical_loss": 3.667785417973784, + "tokens_seen": 948246528 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003599398194583751, + "loss": 2.7659, + "theoretical_loss": 3.6677610575835393, + "tokens_seen": 948312064 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035992978936810434, + "loss": 2.8949, + "theoretical_loss": 3.6677366993480787, + "tokens_seen": 948377600 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035991975927783347, + "loss": 2.8779, + "theoretical_loss": 3.6677123432670626, + "tokens_seen": 948443136 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003599097291875627, + "loss": 2.9694, + "theoretical_loss": 3.66768798934015, + "tokens_seen": 948508672 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003598996990972919, + "loss": 2.9063, + "theoretical_loss": 3.667663637567004, + "tokens_seen": 948574208 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1526876, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6691465377807617, + "objective/train/theoretical_loss": 3.667651462487987, + "objective/train/tokens_used": 969066976, + "theoretical_loss": 3.667651462487987, + "tokens_seen": 948606976 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035988966900702107, + "loss": 2.8067, + "theoretical_loss": 3.6676392879472837, + "tokens_seen": 948639744 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035987963891675025, + "loss": 2.7822, + "theoretical_loss": 3.66761494048065, + "tokens_seen": 948705280 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035986960882647943, + "loss": 2.9408, + "theoretical_loss": 3.667590595166765, + "tokens_seen": 948770816 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003598595787362086, + "loss": 2.9706, + "theoretical_loss": 3.667566252005288, + "tokens_seen": 948836352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035984954864593785, + "loss": 2.9058, + "theoretical_loss": 3.6675419109958813, + "tokens_seen": 948901888 + }, + { + "epoch": 2.08, + "learning_rate": 0.000359839518555667, + "loss": 3.1064, + "theoretical_loss": 3.6675175721382054, + "tokens_seen": 948967424 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003598294884653962, + "loss": 2.7847, + "theoretical_loss": 3.6674932354319214, + "tokens_seen": 949032960 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035981945837512534, + "loss": 2.9186, + "theoretical_loss": 3.6674689008766914, + "tokens_seen": 949098496 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035980942828485457, + "loss": 2.9564, + "theoretical_loss": 3.6674445684721766, + "tokens_seen": 949164032 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035979939819458375, + "loss": 2.9421, + "theoretical_loss": 3.667420238218037, + "tokens_seen": 949229568 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035978936810431293, + "loss": 2.8335, + "theoretical_loss": 3.667395910113936, + "tokens_seen": 949295104 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003597793380140421, + "loss": 2.9535, + "theoretical_loss": 3.6673715841595342, + "tokens_seen": 949360640 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035976930792377135, + "loss": 2.9736, + "theoretical_loss": 3.667347260354494, + "tokens_seen": 949426176 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003597592778335005, + "loss": 2.8327, + "theoretical_loss": 3.667322938698476, + "tokens_seen": 949491712 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003597492477432297, + "loss": 2.8929, + "theoretical_loss": 3.6672986191911425, + "tokens_seen": 949557248 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035973921765295884, + "loss": 2.8962, + "theoretical_loss": 3.6672743018321556, + "tokens_seen": 949622784 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003597291875626881, + "loss": 2.8117, + "theoretical_loss": 3.6672499866211776, + "tokens_seen": 949688320 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035971915747241726, + "loss": 2.9746, + "theoretical_loss": 3.66722567355787, + "tokens_seen": 949753856 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035970912738214644, + "loss": 2.8694, + "theoretical_loss": 3.6672013626418956, + "tokens_seen": 949819392 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003596990972918756, + "loss": 2.9765, + "theoretical_loss": 3.6671770538729156, + "tokens_seen": 949884928 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003596890672016048, + "loss": 3.0187, + "theoretical_loss": 3.6671527472505927, + "tokens_seen": 949950464 + }, + { + "epoch": 2.08, + "learning_rate": 0.000359679037111334, + "loss": 2.8771, + "theoretical_loss": 3.6671284427745894, + "tokens_seen": 950016000 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003596690070210632, + "loss": 2.9445, + "theoretical_loss": 3.667104140444568, + "tokens_seen": 950081536 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035965897693079234, + "loss": 3.021, + "theoretical_loss": 3.667079840260191, + "tokens_seen": 950147072 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003596489468405216, + "loss": 3.0327, + "theoretical_loss": 3.6670555422211213, + "tokens_seen": 950212608 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1529664, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.735762596130371, + "objective/train/theoretical_loss": 3.6670433940059715, + "objective/train/tokens_used": 970705376, + "theoretical_loss": 3.6670433940059715, + "tokens_seen": 950245376 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035963891675025076, + "loss": 2.8314, + "theoretical_loss": 3.6670312463270216, + "tokens_seen": 950278144 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035962888665997994, + "loss": 2.7663, + "theoretical_loss": 3.6670069525775535, + "tokens_seen": 950343680 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003596188565697091, + "loss": 3.0313, + "theoretical_loss": 3.6669826609723817, + "tokens_seen": 950409216 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003596088264794383, + "loss": 2.9479, + "theoretical_loss": 3.6669583715111678, + "tokens_seen": 950474752 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035959879638916754, + "loss": 2.9453, + "theoretical_loss": 3.6669340841935747, + "tokens_seen": 950540288 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003595887662988967, + "loss": 2.8347, + "theoretical_loss": 3.666909799019266, + "tokens_seen": 950605824 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003595787362086259, + "loss": 2.9151, + "theoretical_loss": 3.6668855159879046, + "tokens_seen": 950671360 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003595687061183551, + "loss": 3.0129, + "theoretical_loss": 3.6668612350991534, + "tokens_seen": 950736896 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035955867602808426, + "loss": 2.8061, + "theoretical_loss": 3.666836956352676, + "tokens_seen": 950802432 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035954864593781344, + "loss": 2.8544, + "theoretical_loss": 3.666812679748136, + "tokens_seen": 950867968 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003595386158475427, + "loss": 2.8357, + "theoretical_loss": 3.666788405285197, + "tokens_seen": 950933504 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003595285857572718, + "loss": 2.8009, + "theoretical_loss": 3.6667641329635217, + "tokens_seen": 950999040 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035951855566700104, + "loss": 2.9501, + "theoretical_loss": 3.666739862782774, + "tokens_seen": 951064576 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035950852557673017, + "loss": 2.7728, + "theoretical_loss": 3.6667155947426173, + "tokens_seen": 951130112 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003594984954864594, + "loss": 2.959, + "theoretical_loss": 3.666691328842716, + "tokens_seen": 951195648 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003594884653961886, + "loss": 2.8202, + "theoretical_loss": 3.6666670650827333, + "tokens_seen": 951261184 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035947843530591777, + "loss": 2.8179, + "theoretical_loss": 3.6666428034623335, + "tokens_seen": 951326720 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035946840521564695, + "loss": 3.0591, + "theoretical_loss": 3.66661854398118, + "tokens_seen": 951392256 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003594583751253762, + "loss": 2.8092, + "theoretical_loss": 3.6665942866389374, + "tokens_seen": 951457792 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003594483450351053, + "loss": 3.0225, + "theoretical_loss": 3.6665700314352696, + "tokens_seen": 951523328 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035943831494483454, + "loss": 3.0263, + "theoretical_loss": 3.6665457783698407, + "tokens_seen": 951588864 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035942828485456367, + "loss": 2.9207, + "theoretical_loss": 3.666521527442315, + "tokens_seen": 951654400 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003594182547642929, + "loss": 2.7609, + "theoretical_loss": 3.666497278652357, + "tokens_seen": 951719936 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003594082246740221, + "loss": 2.9167, + "theoretical_loss": 3.6664730319996313, + "tokens_seen": 951785472 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035939819458375127, + "loss": 2.9695, + "theoretical_loss": 3.6664487874838017, + "tokens_seen": 951851008 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1532237, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.656838893890381, + "objective/train/theoretical_loss": 3.666436666027118, + "objective/train/tokens_used": 972343776, + "theoretical_loss": 3.666436666027118, + "tokens_seen": 951883776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035938816449348045, + "loss": 2.8376, + "theoretical_loss": 3.666424545104533, + "tokens_seen": 951916544 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035937813440320963, + "loss": 2.8193, + "theoretical_loss": 3.66640030486149, + "tokens_seen": 951982080 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003593681043129388, + "loss": 2.8426, + "theoretical_loss": 3.6663760667543377, + "tokens_seen": 952047616 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035935807422266805, + "loss": 2.7242, + "theoretical_loss": 3.666351830782741, + "tokens_seen": 952113152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003593480441323972, + "loss": 2.935, + "theoretical_loss": 3.6663275969463633, + "tokens_seen": 952178688 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003593380140421264, + "loss": 2.8129, + "theoretical_loss": 3.6663033652448713, + "tokens_seen": 952244224 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035932798395185554, + "loss": 2.9916, + "theoretical_loss": 3.666279135677929, + "tokens_seen": 952309760 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035931795386158477, + "loss": 2.8945, + "theoretical_loss": 3.666254908245202, + "tokens_seen": 952375296 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035930792377131395, + "loss": 3.0055, + "theoretical_loss": 3.6662306829463556, + "tokens_seen": 952440832 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035929789368104313, + "loss": 2.7959, + "theoretical_loss": 3.666206459781054, + "tokens_seen": 952506368 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003592878635907723, + "loss": 2.8205, + "theoretical_loss": 3.6661822387489638, + "tokens_seen": 952571904 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035927783350050155, + "loss": 2.877, + "theoretical_loss": 3.6661580198497496, + "tokens_seen": 952637440 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003592678034102307, + "loss": 2.9189, + "theoretical_loss": 3.666133803083077, + "tokens_seen": 952702976 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003592577733199599, + "loss": 2.9301, + "theoretical_loss": 3.666109588448612, + "tokens_seen": 952768512 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035924774322968904, + "loss": 2.7287, + "theoretical_loss": 3.6660853759460195, + "tokens_seen": 952834048 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003592377131394183, + "loss": 3.0176, + "theoretical_loss": 3.6660611655749658, + "tokens_seen": 952899584 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035922768304914746, + "loss": 2.8801, + "theoretical_loss": 3.666036957335116, + "tokens_seen": 952965120 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035921765295887664, + "loss": 2.9994, + "theoretical_loss": 3.666012751226137, + "tokens_seen": 953030656 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003592076228686058, + "loss": 2.7921, + "theoretical_loss": 3.665988547247694, + "tokens_seen": 953096192 + }, + { + "epoch": 2.08, + "learning_rate": 0.000359197592778335, + "loss": 3.0012, + "theoretical_loss": 3.665964345399453, + "tokens_seen": 953161728 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003591875626880642, + "loss": 2.9291, + "theoretical_loss": 3.66594014568108, + "tokens_seen": 953227264 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003591775325977934, + "loss": 2.9184, + "theoretical_loss": 3.665915948092242, + "tokens_seen": 953292800 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035916750250752254, + "loss": 2.8042, + "theoretical_loss": 3.665891752632604, + "tokens_seen": 953358336 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003591574724172518, + "loss": 2.8349, + "theoretical_loss": 3.665867559301833, + "tokens_seen": 953423872 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035914744232698096, + "loss": 2.8043, + "theoretical_loss": 3.665843368099595, + "tokens_seen": 953489408 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1535056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1286439895629883, + "objective/train/theoretical_loss": 3.665831273296572, + "objective/train/tokens_used": 973982176, + "theoretical_loss": 3.665831273296572, + "tokens_seen": 953522176 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035913741223671014, + "loss": 3.0879, + "theoretical_loss": 3.665819179025557, + "tokens_seen": 953554944 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003591273821464393, + "loss": 2.8745, + "theoretical_loss": 3.665794992079385, + "tokens_seen": 953620480 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003591173520561685, + "loss": 2.9444, + "theoretical_loss": 3.665770807260746, + "tokens_seen": 953686016 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003591073219658977, + "loss": 2.9877, + "theoretical_loss": 3.6657466245693064, + "tokens_seen": 953751552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003590972918756269, + "loss": 2.6837, + "theoretical_loss": 3.665722444004733, + "tokens_seen": 953817088 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035908726178535605, + "loss": 2.6699, + "theoretical_loss": 3.665698265566693, + "tokens_seen": 953882624 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003590772316950853, + "loss": 2.7927, + "theoretical_loss": 3.6656740892548525, + "tokens_seen": 953948160 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003590672016048144, + "loss": 3.0223, + "theoretical_loss": 3.6656499150688795, + "tokens_seen": 954013696 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035905717151454364, + "loss": 2.8822, + "theoretical_loss": 3.66562574300844, + "tokens_seen": 954079232 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003590471414242728, + "loss": 2.7908, + "theoretical_loss": 3.665601573073202, + "tokens_seen": 954144768 + }, + { + "epoch": 2.08, + "learning_rate": 0.000359037111334002, + "loss": 2.8326, + "theoretical_loss": 3.665577405262832, + "tokens_seen": 954210304 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003590270812437312, + "loss": 2.8108, + "theoretical_loss": 3.665553239576998, + "tokens_seen": 954275840 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035901705115346037, + "loss": 2.7302, + "theoretical_loss": 3.6655290760153663, + "tokens_seen": 954341376 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035900702106318955, + "loss": 2.9556, + "theoretical_loss": 3.6655049145776055, + "tokens_seen": 954406912 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003589969909729188, + "loss": 2.8804, + "theoretical_loss": 3.6654807552633826, + "tokens_seen": 954472448 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003589869608826479, + "loss": 2.9689, + "theoretical_loss": 3.665456598072365, + "tokens_seen": 954537984 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035897693079237715, + "loss": 2.8171, + "theoretical_loss": 3.6654324430042204, + "tokens_seen": 954603520 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035896690070210633, + "loss": 2.9126, + "theoretical_loss": 3.665408290058617, + "tokens_seen": 954669056 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003589568706118355, + "loss": 2.748, + "theoretical_loss": 3.665384139235222, + "tokens_seen": 954734592 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003589468405215647, + "loss": 2.8632, + "theoretical_loss": 3.6653599905337035, + "tokens_seen": 954800128 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035893681043129387, + "loss": 2.895, + "theoretical_loss": 3.6653358439537294, + "tokens_seen": 954865664 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035892678034102305, + "loss": 3.1184, + "theoretical_loss": 3.6653116994949677, + "tokens_seen": 954931200 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003589167502507523, + "loss": 2.7884, + "theoretical_loss": 3.665287557157087, + "tokens_seen": 954996736 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003589067201604814, + "loss": 2.8525, + "theoretical_loss": 3.6652634169397547, + "tokens_seen": 955062272 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035889669007021065, + "loss": 2.8768, + "theoretical_loss": 3.6652392788426393, + "tokens_seen": 955127808 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1537778, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.771440267562866, + "objective/train/theoretical_loss": 3.6652272105890598, + "objective/train/tokens_used": 975620576, + "theoretical_loss": 3.6652272105890598, + "tokens_seen": 955160576 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003588866599799398, + "loss": 2.8008, + "theoretical_loss": 3.66521514286541, + "tokens_seen": 955193344 + }, + { + "epoch": 2.08, + "learning_rate": 0.000358876629889669, + "loss": 2.8309, + "theoretical_loss": 3.6651910090077333, + "tokens_seen": 955258880 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003588665997993982, + "loss": 2.8331, + "theoretical_loss": 3.6651668772692796, + "tokens_seen": 955324416 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003588565697091274, + "loss": 2.9673, + "theoretical_loss": 3.6651427476497167, + "tokens_seen": 955389952 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003588465396188566, + "loss": 2.9484, + "theoretical_loss": 3.6651186201487125, + "tokens_seen": 955455488 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035883650952858574, + "loss": 2.8721, + "theoretical_loss": 3.6650944947659374, + "tokens_seen": 955521024 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035882647943831497, + "loss": 2.9229, + "theoretical_loss": 3.6650703715010584, + "tokens_seen": 955586560 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035881644934804415, + "loss": 2.9318, + "theoretical_loss": 3.665046250353745, + "tokens_seen": 955652096 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035880641925777333, + "loss": 2.8425, + "theoretical_loss": 3.6650221313236666, + "tokens_seen": 955717632 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003587963891675025, + "loss": 3.0306, + "theoretical_loss": 3.6649980144104917, + "tokens_seen": 955783168 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035878635907723175, + "loss": 3.022, + "theoretical_loss": 3.6649738996138894, + "tokens_seen": 955848704 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003587763289869609, + "loss": 2.7621, + "theoretical_loss": 3.664949786933529, + "tokens_seen": 955914240 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003587662988966901, + "loss": 2.854, + "theoretical_loss": 3.664925676369079, + "tokens_seen": 955979776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035875626880641924, + "loss": 2.8541, + "theoretical_loss": 3.6649015679202104, + "tokens_seen": 956045312 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003587462387161485, + "loss": 2.8758, + "theoretical_loss": 3.664877461586591, + "tokens_seen": 956110848 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035873620862587766, + "loss": 2.8028, + "theoretical_loss": 3.6648533573678908, + "tokens_seen": 956176384 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035872617853560684, + "loss": 2.864, + "theoretical_loss": 3.664829255263779, + "tokens_seen": 956241920 + }, + { + "epoch": 2.08, + "learning_rate": 0.000358716148445336, + "loss": 2.847, + "theoretical_loss": 3.6648051552739256, + "tokens_seen": 956307456 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003587061183550652, + "loss": 2.972, + "theoretical_loss": 3.664781057398, + "tokens_seen": 956372992 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003586960882647944, + "loss": 2.8988, + "theoretical_loss": 3.664756961635672, + "tokens_seen": 956438528 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003586860581745236, + "loss": 3.0611, + "theoretical_loss": 3.6647328679866122, + "tokens_seen": 956504064 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035867602808425274, + "loss": 2.6127, + "theoretical_loss": 3.664708776450489, + "tokens_seen": 956569600 + }, + { + "epoch": 2.08, + "learning_rate": 0.000358665997993982, + "loss": 2.843, + "theoretical_loss": 3.6646846870269734, + "tokens_seen": 956635136 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035865596790371116, + "loss": 2.8737, + "theoretical_loss": 3.664660599715735, + "tokens_seen": 956700672 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035864593781344034, + "loss": 2.9028, + "theoretical_loss": 3.6646365145164435, + "tokens_seen": 956766208 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1539261, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6032583713531494, + "objective/train/theoretical_loss": 3.6646244727086756, + "objective/train/tokens_used": 977258976, + "theoretical_loss": 3.6646244727086756, + "tokens_seen": 956798976 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003586359077231695, + "loss": 2.7049, + "theoretical_loss": 3.6646124314287705, + "tokens_seen": 956831744 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003586258776328987, + "loss": 2.9082, + "theoretical_loss": 3.664588350452385, + "tokens_seen": 956897280 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003586158475426279, + "loss": 2.812, + "theoretical_loss": 3.6645642715869577, + "tokens_seen": 956962816 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003586058174523571, + "loss": 3.0367, + "theoretical_loss": 3.664540194832159, + "tokens_seen": 957028352 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035859578736208625, + "loss": 3.0805, + "theoretical_loss": 3.6645161201876597, + "tokens_seen": 957093888 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003585857572718155, + "loss": 2.9126, + "theoretical_loss": 3.6644920476531295, + "tokens_seen": 957159424 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003585757271815446, + "loss": 2.9391, + "theoretical_loss": 3.66446797722824, + "tokens_seen": 957224960 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035856569709127384, + "loss": 2.9683, + "theoretical_loss": 3.664443908912661, + "tokens_seen": 957290496 + }, + { + "epoch": 2.08, + "learning_rate": 0.000358555667001003, + "loss": 2.8851, + "theoretical_loss": 3.6644198427060646, + "tokens_seen": 957356032 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003585456369107322, + "loss": 2.789, + "theoretical_loss": 3.66439577860812, + "tokens_seen": 957421568 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003585356068204614, + "loss": 2.8929, + "theoretical_loss": 3.6643717166184993, + "tokens_seen": 957487104 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035852557673019057, + "loss": 2.9749, + "theoretical_loss": 3.664347656736873, + "tokens_seen": 957552640 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035851554663991975, + "loss": 2.9593, + "theoretical_loss": 3.6643235989629126, + "tokens_seen": 957618176 + }, + { + "epoch": 2.08, + "learning_rate": 0.000358505516549649, + "loss": 2.7488, + "theoretical_loss": 3.6642995432962886, + "tokens_seen": 957683712 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003584954864593781, + "loss": 2.85, + "theoretical_loss": 3.6642754897366725, + "tokens_seen": 957749248 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035848545636910735, + "loss": 2.956, + "theoretical_loss": 3.664251438283736, + "tokens_seen": 957814784 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035847542627883653, + "loss": 2.8743, + "theoretical_loss": 3.66422738893715, + "tokens_seen": 957880320 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003584653961885657, + "loss": 2.8319, + "theoretical_loss": 3.6642033416965862, + "tokens_seen": 957945856 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003584553660982949, + "loss": 2.8268, + "theoretical_loss": 3.6641792965617155, + "tokens_seen": 958011392 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035844533600802407, + "loss": 2.751, + "theoretical_loss": 3.6641552535322104, + "tokens_seen": 958076928 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035843530591775325, + "loss": 2.9361, + "theoretical_loss": 3.664131212607742, + "tokens_seen": 958142464 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003584252758274825, + "loss": 2.9218, + "theoretical_loss": 3.664107173787982, + "tokens_seen": 958208000 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003584152457372116, + "loss": 2.8079, + "theoretical_loss": 3.664083137072602, + "tokens_seen": 958273536 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035840521564694085, + "loss": 3.0393, + "theoretical_loss": 3.664059102461275, + "tokens_seen": 958339072 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035839518555667, + "loss": 2.7744, + "theoretical_loss": 3.6640350699536715, + "tokens_seen": 958404608 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1542166, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3026492595672607, + "objective/train/theoretical_loss": 3.6640230544886645, + "objective/train/tokens_used": 978897376, + "theoretical_loss": 3.6640230544886645, + "tokens_seen": 958437376 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003583851554663992, + "loss": 2.9993, + "theoretical_loss": 3.6640110395494645, + "tokens_seen": 958470144 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003583751253761284, + "loss": 2.9652, + "theoretical_loss": 3.6639870112483264, + "tokens_seen": 958535680 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003583650952858576, + "loss": 2.8297, + "theoretical_loss": 3.6639629850499285, + "tokens_seen": 958601216 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035835506519558676, + "loss": 2.7874, + "theoretical_loss": 3.663938960953943, + "tokens_seen": 958666752 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035834503510531594, + "loss": 2.7955, + "theoretical_loss": 3.6639149389600427, + "tokens_seen": 958732288 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003583350050150451, + "loss": 2.8972, + "theoretical_loss": 3.6638909190678994, + "tokens_seen": 958797824 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035832497492477435, + "loss": 2.742, + "theoretical_loss": 3.6638669012771867, + "tokens_seen": 958863360 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003583149448345035, + "loss": 2.9961, + "theoretical_loss": 3.6638428855875764, + "tokens_seen": 958928896 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003583049147442327, + "loss": 2.8779, + "theoretical_loss": 3.6638188719987417, + "tokens_seen": 958994432 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003582948846539619, + "loss": 2.713, + "theoretical_loss": 3.663794860510354, + "tokens_seen": 959059968 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003582848545636911, + "loss": 2.691, + "theoretical_loss": 3.663770851122087, + "tokens_seen": 959125504 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035827482447342026, + "loss": 2.9775, + "theoretical_loss": 3.6637468438336134, + "tokens_seen": 959191040 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035826479438314944, + "loss": 2.9257, + "theoretical_loss": 3.663722838644606, + "tokens_seen": 959256576 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003582547642928786, + "loss": 2.8699, + "theoretical_loss": 3.6636988355547384, + "tokens_seen": 959322112 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035824473420260786, + "loss": 2.8904, + "theoretical_loss": 3.6636748345636825, + "tokens_seen": 959387648 + }, + { + "epoch": 2.08, + "learning_rate": 0.000358234704112337, + "loss": 2.7472, + "theoretical_loss": 3.6636508356711124, + "tokens_seen": 959453184 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003582246740220662, + "loss": 2.8745, + "theoretical_loss": 3.6636268388767013, + "tokens_seen": 959518720 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035821464393179535, + "loss": 2.9343, + "theoretical_loss": 3.663602844180122, + "tokens_seen": 959584256 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003582046138415246, + "loss": 2.8867, + "theoretical_loss": 3.6635788515810477, + "tokens_seen": 959649792 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035819458375125376, + "loss": 2.9513, + "theoretical_loss": 3.663554861079152, + "tokens_seen": 959715328 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035818455366098294, + "loss": 2.8466, + "theoretical_loss": 3.6635308726741087, + "tokens_seen": 959780864 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003581745235707121, + "loss": 2.7372, + "theoretical_loss": 3.6635068863655915, + "tokens_seen": 959846400 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035816449348044136, + "loss": 2.8541, + "theoretical_loss": 3.663482902153273, + "tokens_seen": 959911936 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003581544633901705, + "loss": 2.9138, + "theoretical_loss": 3.6634589200368284, + "tokens_seen": 959977472 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003581444332998997, + "loss": 2.977, + "theoretical_loss": 3.6634349400159305, + "tokens_seen": 960043008 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1544419, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7253384590148926, + "objective/train/theoretical_loss": 3.6634229507912095, + "objective/train/tokens_used": 980535776, + "theoretical_loss": 3.6634229507912095, + "tokens_seen": 960075776 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035813440320962885, + "loss": 2.9462, + "theoretical_loss": 3.6634109620902535, + "tokens_seen": 960108544 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003581243731193581, + "loss": 2.9468, + "theoretical_loss": 3.6633869862594706, + "tokens_seen": 960174080 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035811434302908727, + "loss": 2.8271, + "theoretical_loss": 3.663363012523257, + "tokens_seen": 960239616 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035810431293881645, + "loss": 3.0577, + "theoretical_loss": 3.663339040881286, + "tokens_seen": 960305152 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003580942828485457, + "loss": 2.8129, + "theoretical_loss": 3.6633150713332325, + "tokens_seen": 960370688 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003580842527582748, + "loss": 2.9438, + "theoretical_loss": 3.6632911038787697, + "tokens_seen": 960436224 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035807422266800404, + "loss": 2.7784, + "theoretical_loss": 3.6632671385175724, + "tokens_seen": 960501760 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003580641925777332, + "loss": 2.8891, + "theoretical_loss": 3.663243175249315, + "tokens_seen": 960567296 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003580541624874624, + "loss": 2.8832, + "theoretical_loss": 3.663219214073672, + "tokens_seen": 960632832 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003580441323971916, + "loss": 2.8235, + "theoretical_loss": 3.6631952549903177, + "tokens_seen": 960698368 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035803410230692077, + "loss": 2.7497, + "theoretical_loss": 3.663171297998927, + "tokens_seen": 960763904 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035802407221664995, + "loss": 2.9928, + "theoretical_loss": 3.6631473430991743, + "tokens_seen": 960829440 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003580140421263792, + "loss": 2.9181, + "theoretical_loss": 3.663123390290734, + "tokens_seen": 960894976 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003580040120361083, + "loss": 3.0563, + "theoretical_loss": 3.6630994395732817, + "tokens_seen": 960960512 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035799398194583755, + "loss": 2.9013, + "theoretical_loss": 3.663075490946492, + "tokens_seen": 961026048 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035798395185556673, + "loss": 2.8464, + "theoretical_loss": 3.66305154441004, + "tokens_seen": 961091584 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003579739217652959, + "loss": 2.9814, + "theoretical_loss": 3.6630275999636, + "tokens_seen": 961157120 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003579638916750251, + "loss": 2.9038, + "theoretical_loss": 3.663003657606848, + "tokens_seen": 961222656 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035795386158475427, + "loss": 2.9078, + "theoretical_loss": 3.662979717339458, + "tokens_seen": 961288192 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035794383149448345, + "loss": 2.9591, + "theoretical_loss": 3.6629557791611065, + "tokens_seen": 961353728 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003579338014042127, + "loss": 2.9257, + "theoretical_loss": 3.662931843071468, + "tokens_seen": 961419264 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003579237713139418, + "loss": 2.9918, + "theoretical_loss": 3.662907909070219, + "tokens_seen": 961484800 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035791374122367105, + "loss": 3.061, + "theoretical_loss": 3.662883977157034, + "tokens_seen": 961550336 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003579037111334002, + "loss": 2.9627, + "theoretical_loss": 3.662860047331588, + "tokens_seen": 961615872 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003578936810431294, + "loss": 2.6828, + "theoretical_loss": 3.6628361195935577, + "tokens_seen": 961681408 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1547059, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.095036268234253, + "objective/train/theoretical_loss": 3.662824156507222, + "objective/train/tokens_used": 982174176, + "theoretical_loss": 3.662824156507222, + "tokens_seen": 961714176 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003578836509528586, + "loss": 2.9876, + "theoretical_loss": 3.6628121939426186, + "tokens_seen": 961746944 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003578736208625878, + "loss": 2.9386, + "theoretical_loss": 3.6627882703784462, + "tokens_seen": 961812480 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035786359077231696, + "loss": 2.9686, + "theoretical_loss": 3.6627643489007164, + "tokens_seen": 961878016 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035785356068204614, + "loss": 2.8716, + "theoretical_loss": 3.662740429509105, + "tokens_seen": 961943552 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003578435305917753, + "loss": 2.9323, + "theoretical_loss": 3.6627165122032874, + "tokens_seen": 962009088 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035783350050150455, + "loss": 2.9268, + "theoretical_loss": 3.662692596982941, + "tokens_seen": 962074624 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003578234704112337, + "loss": 2.9489, + "theoretical_loss": 3.662668683847741, + "tokens_seen": 962140160 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003578134403209629, + "loss": 2.589, + "theoretical_loss": 3.662644772797364, + "tokens_seen": 962205696 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003578034102306921, + "loss": 2.9655, + "theoretical_loss": 3.662620863831486, + "tokens_seen": 962271232 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003577933801404213, + "loss": 2.6946, + "theoretical_loss": 3.662596956949783, + "tokens_seen": 962336768 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035778335005015046, + "loss": 2.8387, + "theoretical_loss": 3.6625730521519326, + "tokens_seen": 962402304 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035777331995987964, + "loss": 2.8806, + "theoretical_loss": 3.6625491494376097, + "tokens_seen": 962467840 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003577632898696088, + "loss": 2.8152, + "theoretical_loss": 3.6625252488064914, + "tokens_seen": 962533376 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035775325977933806, + "loss": 2.8562, + "theoretical_loss": 3.662501350258255, + "tokens_seen": 962598912 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003577432296890672, + "loss": 2.9161, + "theoretical_loss": 3.662477453792577, + "tokens_seen": 962664448 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003577331995987964, + "loss": 2.8701, + "theoretical_loss": 3.6624535594091334, + "tokens_seen": 962729984 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035772316950852555, + "loss": 2.9215, + "theoretical_loss": 3.662429667107602, + "tokens_seen": 962795520 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003577131394182548, + "loss": 2.7292, + "theoretical_loss": 3.662405776887659, + "tokens_seen": 962861056 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035770310932798396, + "loss": 2.9311, + "theoretical_loss": 3.6623818887489814, + "tokens_seen": 962926592 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035769307923771314, + "loss": 2.938, + "theoretical_loss": 3.6623580026912466, + "tokens_seen": 962992128 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003576830491474423, + "loss": 2.8585, + "theoretical_loss": 3.6623341187141314, + "tokens_seen": 963057664 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035767301905717156, + "loss": 2.7501, + "theoretical_loss": 3.6623102368173135, + "tokens_seen": 963123200 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003576629889669007, + "loss": 2.8383, + "theoretical_loss": 3.66228635700047, + "tokens_seen": 963188736 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003576529588766299, + "loss": 2.8458, + "theoretical_loss": 3.6622624792632776, + "tokens_seen": 963254272 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035764292878635905, + "loss": 2.8796, + "theoretical_loss": 3.6622386036054144, + "tokens_seen": 963319808 + }, + { + "epoch": 2.08, + "objective/train/docs_used": 1550039, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.891376256942749, + "objective/train/theoretical_loss": 3.6622266665561307, + "objective/train/tokens_used": 983812576, + "theoretical_loss": 3.6622266665561307, + "tokens_seen": 963352576 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003576328986960883, + "loss": 2.8309, + "theoretical_loss": 3.662214730026558, + "tokens_seen": 963385344 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035762286860581747, + "loss": 2.8975, + "theoretical_loss": 3.6621908585263854, + "tokens_seen": 963450880 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035761283851554665, + "loss": 3.0278, + "theoretical_loss": 3.6621669891045743, + "tokens_seen": 963516416 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035760280842527583, + "loss": 2.8741, + "theoretical_loss": 3.6621431217608027, + "tokens_seen": 963581952 + }, + { + "epoch": 2.08, + "learning_rate": 0.000357592778335005, + "loss": 2.8167, + "theoretical_loss": 3.6621192564947487, + "tokens_seen": 963647488 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003575827482447342, + "loss": 3.0473, + "theoretical_loss": 3.6620953933060894, + "tokens_seen": 963713024 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003575727181544634, + "loss": 2.9775, + "theoretical_loss": 3.6620715321945028, + "tokens_seen": 963778560 + }, + { + "epoch": 2.08, + "learning_rate": 0.00035756268806419255, + "loss": 2.7443, + "theoretical_loss": 3.662047673159668, + "tokens_seen": 963844096 + }, + { + "epoch": 2.08, + "learning_rate": 0.0003575526579739218, + "loss": 2.8255, + "theoretical_loss": 3.6620238162012613, + "tokens_seen": 963909632 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003575426278836509, + "loss": 2.7371, + "theoretical_loss": 3.661999961318962, + "tokens_seen": 963975168 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035753259779338015, + "loss": 3.0229, + "theoretical_loss": 3.6619761085124476, + "tokens_seen": 964040704 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035752256770310933, + "loss": 3.0703, + "theoretical_loss": 3.6619522577813974, + "tokens_seen": 964106240 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003575125376128385, + "loss": 2.8687, + "theoretical_loss": 3.6619284091254887, + "tokens_seen": 964171776 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003575025075225677, + "loss": 2.737, + "theoretical_loss": 3.661904562544401, + "tokens_seen": 964237312 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035749247743229693, + "loss": 2.8293, + "theoretical_loss": 3.6618807180378115, + "tokens_seen": 964302848 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035748244734202606, + "loss": 2.8953, + "theoretical_loss": 3.6618568756054, + "tokens_seen": 964368384 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003574724172517553, + "loss": 2.8875, + "theoretical_loss": 3.661833035246844, + "tokens_seen": 964433920 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003574623871614844, + "loss": 2.8941, + "theoretical_loss": 3.6618091969618227, + "tokens_seen": 964499456 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035745235707121365, + "loss": 2.9434, + "theoretical_loss": 3.6617853607500153, + "tokens_seen": 964564992 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035744232698094283, + "loss": 2.6983, + "theoretical_loss": 3.6617615266111, + "tokens_seen": 964630528 + }, + { + "epoch": 2.09, + "learning_rate": 0.000357432296890672, + "loss": 2.9475, + "theoretical_loss": 3.6617376945447564, + "tokens_seen": 964696064 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003574222668004012, + "loss": 3.0527, + "theoretical_loss": 3.661713864550663, + "tokens_seen": 964761600 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003574122367101304, + "loss": 2.9406, + "theoretical_loss": 3.6616900366284986, + "tokens_seen": 964827136 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035740220661985956, + "loss": 2.8479, + "theoretical_loss": 3.661666210777943, + "tokens_seen": 964892672 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003573921765295888, + "loss": 2.8537, + "theoretical_loss": 3.6616423869986754, + "tokens_seen": 964958208 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1552779, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0616397857666016, + "objective/train/theoretical_loss": 3.6616304758856737, + "objective/train/tokens_used": 985450976, + "theoretical_loss": 3.6616304758856737, + "tokens_seen": 964990976 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003573821464393179, + "loss": 3.0101, + "theoretical_loss": 3.6616185652903743, + "tokens_seen": 965023744 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035737211634904716, + "loss": 2.8604, + "theoretical_loss": 3.6615947456527196, + "tokens_seen": 965089280 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003573620862587763, + "loss": 2.9347, + "theoretical_loss": 3.661570928085391, + "tokens_seen": 965154816 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003573520561685055, + "loss": 2.8127, + "theoretical_loss": 3.6615471125880674, + "tokens_seen": 965220352 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035734202607823475, + "loss": 2.8614, + "theoretical_loss": 3.6615232991604283, + "tokens_seen": 965285888 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003573319959879639, + "loss": 2.8776, + "theoretical_loss": 3.6614994878021543, + "tokens_seen": 965351424 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003573219658976931, + "loss": 2.9654, + "theoretical_loss": 3.661475678512924, + "tokens_seen": 965416960 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003573119358074223, + "loss": 2.9047, + "theoretical_loss": 3.6614518712924173, + "tokens_seen": 965482496 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003573019057171515, + "loss": 3.0663, + "theoretical_loss": 3.6614280661403154, + "tokens_seen": 965548032 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035729187562688066, + "loss": 2.9703, + "theoretical_loss": 3.6614042630562964, + "tokens_seen": 965613568 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035728184553660984, + "loss": 2.9663, + "theoretical_loss": 3.661380462040041, + "tokens_seen": 965679104 + }, + { + "epoch": 2.09, + "learning_rate": 0.000357271815446339, + "loss": 2.8801, + "theoretical_loss": 3.6613566630912304, + "tokens_seen": 965744640 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035726178535606826, + "loss": 2.7741, + "theoretical_loss": 3.661332866209543, + "tokens_seen": 965810176 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003572517552657974, + "loss": 2.7699, + "theoretical_loss": 3.661309071394659, + "tokens_seen": 965875712 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003572417251755266, + "loss": 2.9245, + "theoretical_loss": 3.6612852786462606, + "tokens_seen": 965941248 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035723169508525575, + "loss": 2.7449, + "theoretical_loss": 3.661261487964026, + "tokens_seen": 966006784 + }, + { + "epoch": 2.09, + "learning_rate": 0.000357221664994985, + "loss": 2.8511, + "theoretical_loss": 3.6612376993476374, + "tokens_seen": 966072320 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035721163490471416, + "loss": 3.0112, + "theoretical_loss": 3.661213912796774, + "tokens_seen": 966137856 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035720160481444334, + "loss": 2.9309, + "theoretical_loss": 3.661190128311117, + "tokens_seen": 966203392 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003571915747241725, + "loss": 2.7517, + "theoretical_loss": 3.6611663458903463, + "tokens_seen": 966268928 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035718154463390176, + "loss": 3.0415, + "theoretical_loss": 3.661142565534144, + "tokens_seen": 966334464 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003571715145436309, + "loss": 2.883, + "theoretical_loss": 3.661118787242189, + "tokens_seen": 966400000 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003571614844533601, + "loss": 2.8585, + "theoretical_loss": 3.661095011014164, + "tokens_seen": 966465536 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035715145436308925, + "loss": 2.8627, + "theoretical_loss": 3.6610712368497484, + "tokens_seen": 966531072 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003571414242728185, + "loss": 2.8245, + "theoretical_loss": 3.6610474647486244, + "tokens_seen": 966596608 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1555485, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.045332670211792, + "objective/train/theoretical_loss": 3.6610355794716964, + "objective/train/tokens_used": 987089376, + "theoretical_loss": 3.6610355794716964, + "tokens_seen": 966629376 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035713139418254767, + "loss": 2.9291, + "theoretical_loss": 3.661023694710472, + "tokens_seen": 966662144 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035712136409227685, + "loss": 2.9159, + "theoretical_loss": 3.660999926734973, + "tokens_seen": 966727680 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035711133400200603, + "loss": 2.7909, + "theoretical_loss": 3.660976160821809, + "tokens_seen": 966793216 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003571013039117352, + "loss": 2.8746, + "theoretical_loss": 3.66095239697066, + "tokens_seen": 966858752 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003570912738214644, + "loss": 2.856, + "theoretical_loss": 3.6609286351812083, + "tokens_seen": 966924288 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003570812437311936, + "loss": 2.8702, + "theoretical_loss": 3.6609048754531353, + "tokens_seen": 966989824 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035707121364092275, + "loss": 3.0169, + "theoretical_loss": 3.660881117786122, + "tokens_seen": 967055360 + }, + { + "epoch": 2.09, + "learning_rate": 0.000357061183550652, + "loss": 2.784, + "theoretical_loss": 3.6608573621798506, + "tokens_seen": 967120896 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003570511534603811, + "loss": 2.7637, + "theoretical_loss": 3.660833608634002, + "tokens_seen": 967186432 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035704112337011035, + "loss": 3.0043, + "theoretical_loss": 3.660809857148258, + "tokens_seen": 967251968 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035703109327983953, + "loss": 2.8788, + "theoretical_loss": 3.6607861077223007, + "tokens_seen": 967317504 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003570210631895687, + "loss": 2.6796, + "theoretical_loss": 3.660762360355812, + "tokens_seen": 967383040 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003570110330992979, + "loss": 2.9374, + "theoretical_loss": 3.6607386150484738, + "tokens_seen": 967448576 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035700100300902713, + "loss": 2.8637, + "theoretical_loss": 3.660714871799968, + "tokens_seen": 967514112 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035699097291875626, + "loss": 3.0618, + "theoretical_loss": 3.660691130609976, + "tokens_seen": 967579648 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003569809428284855, + "loss": 3.0206, + "theoretical_loss": 3.6606673914781807, + "tokens_seen": 967645184 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003569709127382146, + "loss": 2.8269, + "theoretical_loss": 3.6606436544042644, + "tokens_seen": 967710720 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035696088264794385, + "loss": 2.9866, + "theoretical_loss": 3.660619919387909, + "tokens_seen": 967776256 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035695085255767303, + "loss": 2.9505, + "theoretical_loss": 3.660596186428797, + "tokens_seen": 967841792 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003569408224674022, + "loss": 2.9909, + "theoretical_loss": 3.6605724555266104, + "tokens_seen": 967907328 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003569307923771314, + "loss": 2.9199, + "theoretical_loss": 3.6605487266810317, + "tokens_seen": 967972864 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003569207622868606, + "loss": 2.916, + "theoretical_loss": 3.6605249998917437, + "tokens_seen": 968038400 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035691073219658976, + "loss": 2.9733, + "theoretical_loss": 3.6605012751584294, + "tokens_seen": 968103936 + }, + { + "epoch": 2.09, + "learning_rate": 0.000356900702106319, + "loss": 2.9209, + "theoretical_loss": 3.6604775524807707, + "tokens_seen": 968169472 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003568906720160481, + "loss": 2.8504, + "theoretical_loss": 3.660453831858451, + "tokens_seen": 968235008 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1556912, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0787220001220703, + "objective/train/theoretical_loss": 3.660441972317944, + "objective/train/tokens_used": 988727776, + "theoretical_loss": 3.660441972317944, + "tokens_seen": 968267776 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035688064192577736, + "loss": 3.0057, + "theoretical_loss": 3.660430113291153, + "tokens_seen": 968300544 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003568706118355065, + "loss": 2.9431, + "theoretical_loss": 3.6604063967785594, + "tokens_seen": 968366080 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003568605817452357, + "loss": 3.0493, + "theoretical_loss": 3.6603826823203534, + "tokens_seen": 968431616 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003568505516549649, + "loss": 2.9242, + "theoretical_loss": 3.6603589699162176, + "tokens_seen": 968497152 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003568405215646941, + "loss": 2.8715, + "theoretical_loss": 3.6603352595658354, + "tokens_seen": 968562688 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035683049147442326, + "loss": 3.0195, + "theoretical_loss": 3.6603115512688897, + "tokens_seen": 968628224 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003568204613841525, + "loss": 2.8765, + "theoretical_loss": 3.6602878450250644, + "tokens_seen": 968693760 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003568104312938816, + "loss": 2.6866, + "theoretical_loss": 3.6602641408340424, + "tokens_seen": 968759296 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035680040120361086, + "loss": 2.8566, + "theoretical_loss": 3.660240438695507, + "tokens_seen": 968824832 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035679037111334, + "loss": 2.7487, + "theoretical_loss": 3.660216738609142, + "tokens_seen": 968890368 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003567803410230692, + "loss": 2.9709, + "theoretical_loss": 3.6601930405746304, + "tokens_seen": 968955904 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003567703109327984, + "loss": 2.8686, + "theoretical_loss": 3.660169344591656, + "tokens_seen": 969021440 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003567602808425276, + "loss": 2.8023, + "theoretical_loss": 3.660145650659903, + "tokens_seen": 969086976 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035675025075225677, + "loss": 3.0712, + "theoretical_loss": 3.660121958779054, + "tokens_seen": 969152512 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035674022066198595, + "loss": 2.8656, + "theoretical_loss": 3.660098268948794, + "tokens_seen": 969218048 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035673019057171513, + "loss": 2.9416, + "theoretical_loss": 3.6600745811688062, + "tokens_seen": 969283584 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035672016048144436, + "loss": 3.0458, + "theoretical_loss": 3.6600508954387747, + "tokens_seen": 969349120 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003567101303911735, + "loss": 2.8099, + "theoretical_loss": 3.6600272117583836, + "tokens_seen": 969414656 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003567001003009027, + "loss": 2.9568, + "theoretical_loss": 3.6600035301273177, + "tokens_seen": 969480192 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035669007021063185, + "loss": 2.8503, + "theoretical_loss": 3.65997985054526, + "tokens_seen": 969545728 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003566800401203611, + "loss": 2.8997, + "theoretical_loss": 3.659956173011895, + "tokens_seen": 969611264 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035667001003009027, + "loss": 3.0216, + "theoretical_loss": 3.6599324975269063, + "tokens_seen": 969676800 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035665997993981945, + "loss": 2.8945, + "theoretical_loss": 3.65990882408998, + "tokens_seen": 969742336 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035664994984954863, + "loss": 2.9115, + "theoretical_loss": 3.6598851527007996, + "tokens_seen": 969807872 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035663991975927787, + "loss": 2.8282, + "theoretical_loss": 3.6598614833590495, + "tokens_seen": 969873408 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1559557, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6703498363494873, + "objective/train/theoretical_loss": 3.6598496494558623, + "objective/train/tokens_used": 990366176, + "theoretical_loss": 3.6598496494558623, + "tokens_seen": 969906176 + }, + { + "epoch": 2.09, + "learning_rate": 0.000356629889669007, + "loss": 2.7548, + "theoretical_loss": 3.6598378160644147, + "tokens_seen": 969938944 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035661985957873623, + "loss": 3.001, + "theoretical_loss": 3.659814150816579, + "tokens_seen": 970004480 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035660982948846536, + "loss": 2.8732, + "theoretical_loss": 3.6597904876152283, + "tokens_seen": 970070016 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003565997993981946, + "loss": 2.9293, + "theoretical_loss": 3.6597668264600465, + "tokens_seen": 970135552 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003565897693079238, + "loss": 2.8065, + "theoretical_loss": 3.659743167350719, + "tokens_seen": 970201088 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035657973921765295, + "loss": 3.007, + "theoretical_loss": 3.65971951028693, + "tokens_seen": 970266624 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003565697091273822, + "loss": 2.9491, + "theoretical_loss": 3.659695855268365, + "tokens_seen": 970332160 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003565596790371113, + "loss": 2.9417, + "theoretical_loss": 3.6596722022947095, + "tokens_seen": 970397696 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035654964894684055, + "loss": 2.7599, + "theoretical_loss": 3.659648551365648, + "tokens_seen": 970463232 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035653961885656973, + "loss": 2.8092, + "theoretical_loss": 3.659624902480866, + "tokens_seen": 970528768 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003565295887662989, + "loss": 2.7742, + "theoretical_loss": 3.6596012556400486, + "tokens_seen": 970594304 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003565195586760281, + "loss": 2.6958, + "theoretical_loss": 3.6595776108428812, + "tokens_seen": 970659840 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035650952858575733, + "loss": 3.0784, + "theoretical_loss": 3.659553968089049, + "tokens_seen": 970725376 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035649949849548646, + "loss": 2.7855, + "theoretical_loss": 3.659530327378238, + "tokens_seen": 970790912 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003564894684052157, + "loss": 2.9345, + "theoretical_loss": 3.6595066887101337, + "tokens_seen": 970856448 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003564794383149448, + "loss": 2.897, + "theoretical_loss": 3.659483052084421, + "tokens_seen": 970921984 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035646940822467405, + "loss": 2.7902, + "theoretical_loss": 3.6594594175007864, + "tokens_seen": 970987520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035645937813440323, + "loss": 2.9486, + "theoretical_loss": 3.659435784958915, + "tokens_seen": 971053056 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003564493480441324, + "loss": 2.9779, + "theoretical_loss": 3.6594121544584937, + "tokens_seen": 971118592 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003564393179538616, + "loss": 2.9218, + "theoretical_loss": 3.6593885259992067, + "tokens_seen": 971184128 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003564292878635908, + "loss": 2.9117, + "theoretical_loss": 3.6593648995807415, + "tokens_seen": 971249664 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035641925777331996, + "loss": 2.8808, + "theoretical_loss": 3.659341275202784, + "tokens_seen": 971315200 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003564092276830492, + "loss": 2.8918, + "theoretical_loss": 3.6593176528650195, + "tokens_seen": 971380736 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003563991975927783, + "loss": 2.8654, + "theoretical_loss": 3.659294032567135, + "tokens_seen": 971446272 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035638916750250756, + "loss": 2.9417, + "theoretical_loss": 3.659270414308816, + "tokens_seen": 971511808 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1562551, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.907285213470459, + "objective/train/theoretical_loss": 3.6592586059443954, + "objective/train/tokens_used": 992004576, + "theoretical_loss": 3.6592586059443954, + "tokens_seen": 971544576 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003563791374122367, + "loss": 2.8696, + "theoretical_loss": 3.6592467980897494, + "tokens_seen": 971577344 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003563691073219659, + "loss": 2.8847, + "theoretical_loss": 3.6592231839096208, + "tokens_seen": 971642880 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003563590772316951, + "loss": 2.8909, + "theoretical_loss": 3.6591995717681174, + "tokens_seen": 971708416 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003563490471414243, + "loss": 2.7159, + "theoretical_loss": 3.659175961664926, + "tokens_seen": 971773952 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035633901705115346, + "loss": 2.7371, + "theoretical_loss": 3.659152353599733, + "tokens_seen": 971839488 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003563289869608827, + "loss": 3.0275, + "theoretical_loss": 3.659128747572224, + "tokens_seen": 971905024 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003563189568706118, + "loss": 2.9628, + "theoretical_loss": 3.659105143582087, + "tokens_seen": 971970560 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035630892678034106, + "loss": 2.6074, + "theoretical_loss": 3.6590815416290083, + "tokens_seen": 972036096 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003562988966900702, + "loss": 2.8997, + "theoretical_loss": 3.6590579417126747, + "tokens_seen": 972101632 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003562888665997994, + "loss": 2.7726, + "theoretical_loss": 3.659034343832774, + "tokens_seen": 972167168 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003562788365095286, + "loss": 2.8777, + "theoretical_loss": 3.659010747988992, + "tokens_seen": 972232704 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003562688064192578, + "loss": 3.0095, + "theoretical_loss": 3.658987154181016, + "tokens_seen": 972298240 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035625877632898697, + "loss": 2.9822, + "theoretical_loss": 3.6589635624085335, + "tokens_seen": 972363776 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035624874623871615, + "loss": 2.77, + "theoretical_loss": 3.6589399726712317, + "tokens_seen": 972429312 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035623871614844533, + "loss": 2.9554, + "theoretical_loss": 3.6589163849687982, + "tokens_seen": 972494848 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035622868605817456, + "loss": 2.8655, + "theoretical_loss": 3.6588927993009195, + "tokens_seen": 972560384 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003562186559679037, + "loss": 2.7351, + "theoretical_loss": 3.658869215667284, + "tokens_seen": 972625920 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003562086258776329, + "loss": 3.0021, + "theoretical_loss": 3.658845634067579, + "tokens_seen": 972691456 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035619859578736205, + "loss": 3.0018, + "theoretical_loss": 3.658822054501491, + "tokens_seen": 972756992 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003561885656970913, + "loss": 2.9054, + "theoretical_loss": 3.658798476968709, + "tokens_seen": 972822528 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035617853560682047, + "loss": 2.8587, + "theoretical_loss": 3.65877490146892, + "tokens_seen": 972888064 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035616850551654965, + "loss": 2.8948, + "theoretical_loss": 3.658751328001812, + "tokens_seen": 972953600 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035615847542627883, + "loss": 2.8, + "theoretical_loss": 3.658727756567073, + "tokens_seen": 973019136 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035614844533600807, + "loss": 2.9932, + "theoretical_loss": 3.65870418716439, + "tokens_seen": 973084672 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003561384152457372, + "loss": 2.8173, + "theoretical_loss": 3.658680619793452, + "tokens_seen": 973150208 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1565549, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7954471111297607, + "objective/train/theoretical_loss": 3.65866883686979, + "objective/train/tokens_used": 993642976, + "theoretical_loss": 3.65866883686979, + "tokens_seen": 973182976 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035612838515546643, + "loss": 2.6984, + "theoretical_loss": 3.658657054453947, + "tokens_seen": 973215744 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035611835506519556, + "loss": 2.8188, + "theoretical_loss": 3.6586334911455625, + "tokens_seen": 973281280 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003561083249749248, + "loss": 3.0048, + "theoretical_loss": 3.658609929867987, + "tokens_seen": 973346816 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035609829488465397, + "loss": 2.9668, + "theoretical_loss": 3.6585863706209087, + "tokens_seen": 973412352 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035608826479438315, + "loss": 2.7635, + "theoretical_loss": 3.658562813404016, + "tokens_seen": 973477888 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035607823470411233, + "loss": 2.7039, + "theoretical_loss": 3.6585392582169973, + "tokens_seen": 973543424 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003560682046138415, + "loss": 2.8891, + "theoretical_loss": 3.6585157050595414, + "tokens_seen": 973608960 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003560581745235707, + "loss": 3.0044, + "theoretical_loss": 3.658492153931336, + "tokens_seen": 973674496 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035604814443329993, + "loss": 3.0258, + "theoretical_loss": 3.6584686048320707, + "tokens_seen": 973740032 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035603811434302906, + "loss": 2.9133, + "theoretical_loss": 3.6584450577614334, + "tokens_seen": 973805568 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003560280842527583, + "loss": 2.8831, + "theoretical_loss": 3.658421512719113, + "tokens_seen": 973871104 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003560180541624874, + "loss": 2.9003, + "theoretical_loss": 3.658397969704798, + "tokens_seen": 973936640 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035600802407221666, + "loss": 2.9687, + "theoretical_loss": 3.6583744287181785, + "tokens_seen": 974002176 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035599799398194584, + "loss": 2.8831, + "theoretical_loss": 3.6583508897589425, + "tokens_seen": 974067712 + }, + { + "epoch": 2.09, + "learning_rate": 0.000355987963891675, + "loss": 2.7567, + "theoretical_loss": 3.658327352826779, + "tokens_seen": 974133248 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003559779338014042, + "loss": 2.9633, + "theoretical_loss": 3.658303817921377, + "tokens_seen": 974198784 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035596790371113344, + "loss": 3.0587, + "theoretical_loss": 3.6582802850424256, + "tokens_seen": 974264320 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035595787362086256, + "loss": 2.9496, + "theoretical_loss": 3.658256754189615, + "tokens_seen": 974329856 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003559478435305918, + "loss": 2.8151, + "theoretical_loss": 3.6582332253626335, + "tokens_seen": 974395392 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003559378134403209, + "loss": 3.0601, + "theoretical_loss": 3.65820969856117, + "tokens_seen": 974460928 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035592778335005016, + "loss": 2.8177, + "theoretical_loss": 3.658186173784916, + "tokens_seen": 974526464 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035591775325977934, + "loss": 2.8862, + "theoretical_loss": 3.6581626510335585, + "tokens_seen": 974592000 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003559077231695085, + "loss": 2.8326, + "theoretical_loss": 3.6581391303067887, + "tokens_seen": 974657536 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003558976930792377, + "loss": 2.7373, + "theoretical_loss": 3.6581156116042957, + "tokens_seen": 974723072 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003558876629889669, + "loss": 2.8724, + "theoretical_loss": 3.658092094925769, + "tokens_seen": 974788608 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1568455, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8843460083007812, + "objective/train/theoretical_loss": 3.658080337345396, + "objective/train/tokens_used": 995281376, + "theoretical_loss": 3.658080337345396, + "tokens_seen": 974821376 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035587763289869607, + "loss": 2.914, + "theoretical_loss": 3.658068580270899, + "tokens_seen": 974854144 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003558676028084253, + "loss": 2.9226, + "theoretical_loss": 3.6580450676393745, + "tokens_seen": 974919680 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035585757271815443, + "loss": 2.9577, + "theoretical_loss": 3.6580215570308865, + "tokens_seen": 974985216 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035584754262788366, + "loss": 2.9154, + "theoretical_loss": 3.6579980484451244, + "tokens_seen": 975050752 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003558375125376129, + "loss": 2.6637, + "theoretical_loss": 3.6579745418817784, + "tokens_seen": 975116288 + }, + { + "epoch": 2.09, + "learning_rate": 0.000355827482447342, + "loss": 2.8775, + "theoretical_loss": 3.6579510373405384, + "tokens_seen": 975181824 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035581745235707126, + "loss": 2.751, + "theoretical_loss": 3.657927534821095, + "tokens_seen": 975247360 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003558074222668004, + "loss": 2.9216, + "theoretical_loss": 3.6579040343231375, + "tokens_seen": 975312896 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003557973921765296, + "loss": 2.8816, + "theoretical_loss": 3.6578805358463575, + "tokens_seen": 975378432 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003557873620862588, + "loss": 2.8364, + "theoretical_loss": 3.6578570393904446, + "tokens_seen": 975443968 + }, + { + "epoch": 2.09, + "learning_rate": 0.000355777331995988, + "loss": 2.8176, + "theoretical_loss": 3.6578335449550896, + "tokens_seen": 975509504 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035576730190571717, + "loss": 2.7372, + "theoretical_loss": 3.6578100525399826, + "tokens_seen": 975575040 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035575727181544635, + "loss": 2.7369, + "theoretical_loss": 3.657786562144815, + "tokens_seen": 975640576 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035574724172517553, + "loss": 3.0426, + "theoretical_loss": 3.657763073769276, + "tokens_seen": 975706112 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035573721163490476, + "loss": 2.8588, + "theoretical_loss": 3.657739587413057, + "tokens_seen": 975771648 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003557271815446339, + "loss": 3.0085, + "theoretical_loss": 3.65771610307585, + "tokens_seen": 975837184 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003557171514543631, + "loss": 2.917, + "theoretical_loss": 3.657692620757344, + "tokens_seen": 975902720 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035570712136409225, + "loss": 2.8636, + "theoretical_loss": 3.6576691404572315, + "tokens_seen": 975968256 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003556970912738215, + "loss": 2.8624, + "theoretical_loss": 3.6576456621752023, + "tokens_seen": 976033792 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035568706118355067, + "loss": 2.8121, + "theoretical_loss": 3.657622185910948, + "tokens_seen": 976099328 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035567703109327985, + "loss": 2.7509, + "theoretical_loss": 3.6575987116641597, + "tokens_seen": 976164864 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035566700100300903, + "loss": 2.834, + "theoretical_loss": 3.6575752394345287, + "tokens_seen": 976230400 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035565697091273827, + "loss": 2.9119, + "theoretical_loss": 3.6575517692217456, + "tokens_seen": 976295936 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003556469408224674, + "loss": 2.8318, + "theoretical_loss": 3.6575283010255033, + "tokens_seen": 976361472 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035563691073219663, + "loss": 3.0203, + "theoretical_loss": 3.657504834845491, + "tokens_seen": 976427008 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1571348, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.81406307220459, + "objective/train/theoretical_loss": 3.6574931025114754, + "objective/train/tokens_used": 996919776, + "theoretical_loss": 3.6574931025114754, + "tokens_seen": 976459776 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035562688064192576, + "loss": 2.8555, + "theoretical_loss": 3.6574813706814018, + "tokens_seen": 976492544 + }, + { + "epoch": 2.09, + "learning_rate": 0.000355616850551655, + "loss": 2.8832, + "theoretical_loss": 3.657457908532927, + "tokens_seen": 976558080 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035560682046138417, + "loss": 2.9666, + "theoretical_loss": 3.6574344483997576, + "tokens_seen": 976623616 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035559679037111335, + "loss": 2.9622, + "theoretical_loss": 3.657410990281586, + "tokens_seen": 976689152 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035558676028084253, + "loss": 3.0374, + "theoretical_loss": 3.6573875341781035, + "tokens_seen": 976754688 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003555767301905717, + "loss": 2.922, + "theoretical_loss": 3.6573640800890015, + "tokens_seen": 976820224 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003555667001003009, + "loss": 2.9717, + "theoretical_loss": 3.657340628013973, + "tokens_seen": 976885760 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035555667001003013, + "loss": 2.8174, + "theoretical_loss": 3.6573171779527085, + "tokens_seen": 976951296 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035554663991975926, + "loss": 2.8473, + "theoretical_loss": 3.657293729904902, + "tokens_seen": 977016832 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003555366098294885, + "loss": 2.9264, + "theoretical_loss": 3.6572702838702433, + "tokens_seen": 977082368 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003555265797392176, + "loss": 2.82, + "theoretical_loss": 3.6572468398484266, + "tokens_seen": 977147904 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035551654964894686, + "loss": 2.8304, + "theoretical_loss": 3.6572233978391426, + "tokens_seen": 977213440 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035550651955867604, + "loss": 2.8448, + "theoretical_loss": 3.6571999578420846, + "tokens_seen": 977278976 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003554964894684052, + "loss": 2.87, + "theoretical_loss": 3.657176519856944, + "tokens_seen": 977344512 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003554864593781344, + "loss": 2.9379, + "theoretical_loss": 3.657153083883414, + "tokens_seen": 977410048 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035547642928786364, + "loss": 2.993, + "theoretical_loss": 3.657129649921187, + "tokens_seen": 977475584 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035546639919759276, + "loss": 2.8444, + "theoretical_loss": 3.6571062179699556, + "tokens_seen": 977541120 + }, + { + "epoch": 2.09, + "learning_rate": 0.000355456369107322, + "loss": 2.7946, + "theoretical_loss": 3.6570827880294114, + "tokens_seen": 977606656 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003554463390170511, + "loss": 2.9484, + "theoretical_loss": 3.6570593600992485, + "tokens_seen": 977672192 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035543630892678036, + "loss": 2.989, + "theoretical_loss": 3.6570359341791585, + "tokens_seen": 977737728 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035542627883650954, + "loss": 2.8477, + "theoretical_loss": 3.657012510268835, + "tokens_seen": 977803264 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003554162487462387, + "loss": 2.8392, + "theoretical_loss": 3.656989088367971, + "tokens_seen": 977868800 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003554062186559679, + "loss": 3.0656, + "theoretical_loss": 3.656965668476259, + "tokens_seen": 977934336 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003553961885656971, + "loss": 2.8315, + "theoretical_loss": 3.656942250593392, + "tokens_seen": 977999872 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035538615847542627, + "loss": 2.8929, + "theoretical_loss": 3.6569188347190638, + "tokens_seen": 978065408 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1573654, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6522109508514404, + "objective/train/theoretical_loss": 3.656907127535005, + "objective/train/tokens_used": 998558176, + "theoretical_loss": 3.656907127535005, + "tokens_seen": 978098176 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003553761283851555, + "loss": 2.863, + "theoretical_loss": 3.6568954208529663, + "tokens_seen": 978130944 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035536609829488463, + "loss": 2.8714, + "theoretical_loss": 3.6568720089947937, + "tokens_seen": 978196480 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035535606820461386, + "loss": 2.8873, + "theoretical_loss": 3.656848599144239, + "tokens_seen": 978262016 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035534603811434304, + "loss": 2.8658, + "theoretical_loss": 3.656825191300996, + "tokens_seen": 978327552 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003553360080240722, + "loss": 2.8011, + "theoretical_loss": 3.656801785464757, + "tokens_seen": 978393088 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003553259779338014, + "loss": 2.9733, + "theoretical_loss": 3.656778381635217, + "tokens_seen": 978458624 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003553159478435306, + "loss": 2.89, + "theoretical_loss": 3.656754979812068, + "tokens_seen": 978524160 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035530591775325977, + "loss": 2.8537, + "theoretical_loss": 3.6567315799950046, + "tokens_seen": 978589696 + }, + { + "epoch": 2.09, + "learning_rate": 0.000355295887662989, + "loss": 2.9651, + "theoretical_loss": 3.656708182183721, + "tokens_seen": 978655232 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035528585757271813, + "loss": 2.9319, + "theoretical_loss": 3.6566847863779097, + "tokens_seen": 978720768 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035527582748244737, + "loss": 2.8819, + "theoretical_loss": 3.6566613925772655, + "tokens_seen": 978786304 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003552657973921765, + "loss": 2.9274, + "theoretical_loss": 3.6566380007814816, + "tokens_seen": 978851840 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035525576730190573, + "loss": 2.9145, + "theoretical_loss": 3.656614610990253, + "tokens_seen": 978917376 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003552457372116349, + "loss": 3.0293, + "theoretical_loss": 3.6565912232032725, + "tokens_seen": 978982912 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003552357071213641, + "loss": 2.9593, + "theoretical_loss": 3.6565678374202344, + "tokens_seen": 979048448 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035522567703109327, + "loss": 2.7397, + "theoretical_loss": 3.656544453640834, + "tokens_seen": 979113984 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035521564694082245, + "loss": 2.8568, + "theoretical_loss": 3.6565210718647645, + "tokens_seen": 979179520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035520561685055163, + "loss": 2.8202, + "theoretical_loss": 3.6564976920917207, + "tokens_seen": 979245056 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035519558676028087, + "loss": 2.8251, + "theoretical_loss": 3.6564743143213962, + "tokens_seen": 979310592 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035518555667001, + "loss": 2.9351, + "theoretical_loss": 3.656450938553486, + "tokens_seen": 979376128 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035517552657973923, + "loss": 2.7654, + "theoretical_loss": 3.656427564787685, + "tokens_seen": 979441664 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003551654964894684, + "loss": 2.7671, + "theoretical_loss": 3.6564041930236875, + "tokens_seen": 979507200 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003551554663991976, + "loss": 2.8504, + "theoretical_loss": 3.6563808232611876, + "tokens_seen": 979572736 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003551454363089268, + "loss": 3.0496, + "theoretical_loss": 3.656357455499881, + "tokens_seen": 979638272 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035513540621865596, + "loss": 2.8164, + "theoretical_loss": 3.6563340897394614, + "tokens_seen": 979703808 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1575093, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7306783199310303, + "objective/train/theoretical_loss": 3.6563224076094887, + "objective/train/tokens_used": 1000196576, + "theoretical_loss": 3.6563224076094887, + "tokens_seen": 979736576 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035512537612838514, + "loss": 2.8258, + "theoretical_loss": 3.6563107259796235, + "tokens_seen": 979769344 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035511534603811437, + "loss": 3.0038, + "theoretical_loss": 3.6562873642200637, + "tokens_seen": 979834880 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003551053159478435, + "loss": 2.8718, + "theoretical_loss": 3.656264004460476, + "tokens_seen": 979900416 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035509528585757273, + "loss": 2.9268, + "theoretical_loss": 3.6562406467005557, + "tokens_seen": 979965952 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003550852557673019, + "loss": 2.7059, + "theoretical_loss": 3.656217290939998, + "tokens_seen": 980031488 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003550752256770311, + "loss": 2.8494, + "theoretical_loss": 3.6561939371784975, + "tokens_seen": 980097024 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035506519558676033, + "loss": 2.892, + "theoretical_loss": 3.6561705854157496, + "tokens_seen": 980162560 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035505516549648946, + "loss": 2.9812, + "theoretical_loss": 3.65614723565145, + "tokens_seen": 980228096 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003550451354062187, + "loss": 2.9687, + "theoretical_loss": 3.656123887885294, + "tokens_seen": 980293632 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003550351053159478, + "loss": 2.9783, + "theoretical_loss": 3.6561005421169765, + "tokens_seen": 980359168 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035502507522567706, + "loss": 2.8099, + "theoretical_loss": 3.6560771983461944, + "tokens_seen": 980424704 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035501504513540624, + "loss": 2.8949, + "theoretical_loss": 3.6560538565726413, + "tokens_seen": 980490240 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003550050150451354, + "loss": 2.8761, + "theoretical_loss": 3.6560305167960143, + "tokens_seen": 980555776 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003549949849548646, + "loss": 2.807, + "theoretical_loss": 3.656007179016009, + "tokens_seen": 980621312 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035498495486459384, + "loss": 2.9114, + "theoretical_loss": 3.6559838432323204, + "tokens_seen": 980686848 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035497492477432296, + "loss": 2.9224, + "theoretical_loss": 3.655960509444645, + "tokens_seen": 980752384 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003549648946840522, + "loss": 2.885, + "theoretical_loss": 3.655937177652678, + "tokens_seen": 980817920 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003549548645937813, + "loss": 2.9848, + "theoretical_loss": 3.655913847856117, + "tokens_seen": 980883456 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035494483450351056, + "loss": 3.03, + "theoretical_loss": 3.6558905200546556, + "tokens_seen": 980948992 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035493480441323974, + "loss": 2.8531, + "theoretical_loss": 3.655867194247992, + "tokens_seen": 981014528 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003549247743229689, + "loss": 2.7006, + "theoretical_loss": 3.655843870435822, + "tokens_seen": 981080064 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003549147442326981, + "loss": 2.7718, + "theoretical_loss": 3.655820548617841, + "tokens_seen": 981145600 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003549047141424273, + "loss": 2.8602, + "theoretical_loss": 3.6557972287937455, + "tokens_seen": 981211136 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035489468405215647, + "loss": 2.8885, + "theoretical_loss": 3.655773910963233, + "tokens_seen": 981276672 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003548846539618857, + "loss": 2.8026, + "theoretical_loss": 3.655750595125998, + "tokens_seen": 981342208 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1577929, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.749990224838257, + "objective/train/theoretical_loss": 3.6557389379547653, + "objective/train/tokens_used": 1001834976, + "theoretical_loss": 3.6557389379547653, + "tokens_seen": 981374976 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035487462387161483, + "loss": 2.851, + "theoretical_loss": 3.655727281281739, + "tokens_seen": 981407744 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035486459378134406, + "loss": 2.8963, + "theoretical_loss": 3.655703969430151, + "tokens_seen": 981473280 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035485456369107324, + "loss": 2.9964, + "theoretical_loss": 3.6556806595709315, + "tokens_seen": 981538816 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003548445336008024, + "loss": 2.9695, + "theoretical_loss": 3.655657351703778, + "tokens_seen": 981604352 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003548345035105316, + "loss": 2.9344, + "theoretical_loss": 3.655634045828385, + "tokens_seen": 981669888 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003548244734202608, + "loss": 2.9826, + "theoretical_loss": 3.655610741944451, + "tokens_seen": 981735424 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035481444332998997, + "loss": 2.8594, + "theoretical_loss": 3.655587440051673, + "tokens_seen": 981800960 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003548044132397192, + "loss": 2.846, + "theoretical_loss": 3.6555641401497474, + "tokens_seen": 981866496 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035479438314944833, + "loss": 3.0244, + "theoretical_loss": 3.6555408422383713, + "tokens_seen": 981932032 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035478435305917757, + "loss": 2.8844, + "theoretical_loss": 3.655517546317242, + "tokens_seen": 981997568 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003547743229689067, + "loss": 2.7654, + "theoretical_loss": 3.655494252386056, + "tokens_seen": 982063104 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035476429287863593, + "loss": 2.9228, + "theoretical_loss": 3.6554709604445117, + "tokens_seen": 982128640 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003547542627883651, + "loss": 2.799, + "theoretical_loss": 3.655447670492306, + "tokens_seen": 982194176 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003547442326980943, + "loss": 2.9884, + "theoretical_loss": 3.6554243825291355, + "tokens_seen": 982259712 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035473420260782347, + "loss": 2.8228, + "theoretical_loss": 3.655401096554699, + "tokens_seen": 982325248 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035472417251755265, + "loss": 2.8304, + "theoretical_loss": 3.655377812568693, + "tokens_seen": 982390784 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035471414242728183, + "loss": 3.0387, + "theoretical_loss": 3.655354530570815, + "tokens_seen": 982456320 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035470411233701107, + "loss": 2.972, + "theoretical_loss": 3.6553312505607627, + "tokens_seen": 982521856 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003546940822467402, + "loss": 2.7859, + "theoretical_loss": 3.6553079725382345, + "tokens_seen": 982587392 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035468405215646943, + "loss": 2.8641, + "theoretical_loss": 3.6552846965029273, + "tokens_seen": 982652928 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003546740220661986, + "loss": 2.8892, + "theoretical_loss": 3.6552614224545397, + "tokens_seen": 982718464 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003546639919759278, + "loss": 2.9978, + "theoretical_loss": 3.655238150392769, + "tokens_seen": 982784000 + }, + { + "epoch": 2.09, + "learning_rate": 0.000354653961885657, + "loss": 3.0053, + "theoretical_loss": 3.6552148803173137, + "tokens_seen": 982849536 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035464393179538616, + "loss": 3.034, + "theoretical_loss": 3.6551916122278714, + "tokens_seen": 982915072 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035463390170511534, + "loss": 2.9509, + "theoretical_loss": 3.6551683461241407, + "tokens_seen": 982980608 + }, + { + "debugging/Self-BLEU-5": 0.5837910905496678, + "debugging/distinct-1-grams": 0.7445929750279571, + "debugging/distinct-2-grams": 0.9491162952355862, + "debugging/entropy-1-grams": 6.265900999935033, + "debugging/entropy-2-grams": 7.4131088852498825, + "debugging/length": 545.375, + "debugging/num_segments": 24, + "epoch": 2.09, + "objective/train/docs_used": 1580721, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.713019609451294, + "objective/train/theoretical_loss": 3.6551567138168224, + "objective/train/tokens_used": 1003473376, + "theoretical_loss": 3.6551567138168224, + "tokens_seen": 983013376 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035462387161484457, + "loss": 2.8688, + "theoretical_loss": 3.655145082005819, + "tokens_seen": 983046144 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003546138415245737, + "loss": 2.8757, + "theoretical_loss": 3.655121819872605, + "tokens_seen": 983111680 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035460381143430293, + "loss": 2.8305, + "theoretical_loss": 3.6550985597241974, + "tokens_seen": 983177216 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035459378134403206, + "loss": 2.8541, + "theoretical_loss": 3.6550753015602937, + "tokens_seen": 983242752 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003545837512537613, + "loss": 2.8491, + "theoretical_loss": 3.655052045380593, + "tokens_seen": 983308288 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003545737211634905, + "loss": 2.9072, + "theoretical_loss": 3.655028791184793, + "tokens_seen": 983373824 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035456369107321966, + "loss": 2.7397, + "theoretical_loss": 3.6550055389725937, + "tokens_seen": 983439360 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035455366098294884, + "loss": 2.8018, + "theoretical_loss": 3.654982288743692, + "tokens_seen": 983504896 + }, + { + "epoch": 2.09, + "learning_rate": 0.000354543630892678, + "loss": 2.9116, + "theoretical_loss": 3.654959040497788, + "tokens_seen": 983570432 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003545336008024072, + "loss": 3.0048, + "theoretical_loss": 3.6549357942345795, + "tokens_seen": 983635968 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035452357071213644, + "loss": 2.8934, + "theoretical_loss": 3.654912549953766, + "tokens_seen": 983701504 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035451354062186556, + "loss": 2.9536, + "theoretical_loss": 3.654889307655046, + "tokens_seen": 983767040 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003545035105315948, + "loss": 2.8604, + "theoretical_loss": 3.6548660673381184, + "tokens_seen": 983832576 + }, + { + "epoch": 2.09, + "learning_rate": 0.000354493480441324, + "loss": 2.9569, + "theoretical_loss": 3.654842829002683, + "tokens_seen": 983898112 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035448345035105316, + "loss": 2.8836, + "theoretical_loss": 3.6548195926484377, + "tokens_seen": 983963648 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035447342026078234, + "loss": 2.9079, + "theoretical_loss": 3.654796358275082, + "tokens_seen": 984029184 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003544633901705115, + "loss": 2.7358, + "theoretical_loss": 3.6547731258823157, + "tokens_seen": 984094720 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003544533600802407, + "loss": 2.7322, + "theoretical_loss": 3.654749895469838, + "tokens_seen": 984160256 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035444332998996994, + "loss": 2.896, + "theoretical_loss": 3.654726667037348, + "tokens_seen": 984225792 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035443329989969907, + "loss": 2.8145, + "theoretical_loss": 3.6547034405845444, + "tokens_seen": 984291328 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003544232698094283, + "loss": 2.8932, + "theoretical_loss": 3.654680216111128, + "tokens_seen": 984356864 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035441323971915743, + "loss": 2.897, + "theoretical_loss": 3.654656993616798, + "tokens_seen": 984422400 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035440320962888667, + "loss": 3.0134, + "theoretical_loss": 3.6546337731012533, + "tokens_seen": 984487936 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035439317953861585, + "loss": 2.5627, + "theoretical_loss": 3.654610554564194, + "tokens_seen": 984553472 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035438314944834503, + "loss": 2.9534, + "theoretical_loss": 3.65458733800532, + "tokens_seen": 984619008 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1583365, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.015463352203369, + "objective/train/theoretical_loss": 3.6545757304676085, + "objective/train/tokens_used": 1005111776, + "theoretical_loss": 3.6545757304676085, + "tokens_seen": 984651776 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003543731193580742, + "loss": 2.839, + "theoretical_loss": 3.654564123424331, + "tokens_seen": 984684544 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035436308926780344, + "loss": 2.9924, + "theoretical_loss": 3.654540910820927, + "tokens_seen": 984750080 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003543530591775326, + "loss": 2.9996, + "theoretical_loss": 3.6545177001948073, + "tokens_seen": 984815616 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003543430290872618, + "loss": 2.8594, + "theoretical_loss": 3.654494491545673, + "tokens_seen": 984881152 + }, + { + "epoch": 2.09, + "learning_rate": 0.000354332998996991, + "loss": 2.8206, + "theoretical_loss": 3.6544712848732237, + "tokens_seen": 984946688 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035432296890672017, + "loss": 2.7681, + "theoretical_loss": 3.6544480801771586, + "tokens_seen": 985012224 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003543129388164494, + "loss": 3.0297, + "theoretical_loss": 3.6544248774571795, + "tokens_seen": 985077760 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035430290872617853, + "loss": 2.7273, + "theoretical_loss": 3.654401676712985, + "tokens_seen": 985143296 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035429287863590777, + "loss": 2.6638, + "theoretical_loss": 3.6543784779442774, + "tokens_seen": 985208832 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003542828485456369, + "loss": 2.8167, + "theoretical_loss": 3.654355281150756, + "tokens_seen": 985274368 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035427281845536613, + "loss": 2.9187, + "theoretical_loss": 3.6543320863321203, + "tokens_seen": 985339904 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003542627883650953, + "loss": 3.0081, + "theoretical_loss": 3.6543088934880723, + "tokens_seen": 985405440 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003542527582748245, + "loss": 2.8993, + "theoretical_loss": 3.6542857026183126, + "tokens_seen": 985470976 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035424272818455367, + "loss": 2.8838, + "theoretical_loss": 3.654262513722541, + "tokens_seen": 985536512 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035423269809428285, + "loss": 2.9494, + "theoretical_loss": 3.6542393268004587, + "tokens_seen": 985602048 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035422266800401203, + "loss": 3.1025, + "theoretical_loss": 3.6542161418517667, + "tokens_seen": 985667584 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035421263791374127, + "loss": 2.8928, + "theoretical_loss": 3.654192958876165, + "tokens_seen": 985733120 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003542026078234704, + "loss": 2.9162, + "theoretical_loss": 3.654169777873355, + "tokens_seen": 985798656 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035419257773319963, + "loss": 2.83, + "theoretical_loss": 3.654146598843038, + "tokens_seen": 985864192 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003541825476429288, + "loss": 2.9323, + "theoretical_loss": 3.6541234217849152, + "tokens_seen": 985929728 + }, + { + "epoch": 2.09, + "learning_rate": 0.000354172517552658, + "loss": 2.9363, + "theoretical_loss": 3.654100246698687, + "tokens_seen": 985995264 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003541624874623872, + "loss": 2.9509, + "theoretical_loss": 3.6540770735840553, + "tokens_seen": 986060800 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035415245737211636, + "loss": 2.8356, + "theoretical_loss": 3.6540539024407206, + "tokens_seen": 986126336 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035414242728184554, + "loss": 2.8982, + "theoretical_loss": 3.6540307332683843, + "tokens_seen": 986191872 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035413239719157477, + "loss": 2.9063, + "theoretical_loss": 3.6540075660667486, + "tokens_seen": 986257408 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1585914, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.032738447189331, + "objective/train/theoretical_loss": 3.6539959832048496, + "objective/train/tokens_used": 1006750176, + "theoretical_loss": 3.6539959832048496, + "tokens_seen": 986290176 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003541223671013039, + "loss": 3.0229, + "theoretical_loss": 3.6539844008355145, + "tokens_seen": 986322944 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035411233701103314, + "loss": 2.9548, + "theoretical_loss": 3.6539612375743826, + "tokens_seen": 986388480 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035410230692076226, + "loss": 2.9568, + "theoretical_loss": 3.653938076283056, + "tokens_seen": 986454016 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003540922768304915, + "loss": 2.9993, + "theoretical_loss": 3.6539149169612353, + "tokens_seen": 986519552 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003540822467402207, + "loss": 2.8855, + "theoretical_loss": 3.653891759608623, + "tokens_seen": 986585088 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035407221664994986, + "loss": 2.9258, + "theoretical_loss": 3.65386860422492, + "tokens_seen": 986650624 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035406218655967904, + "loss": 2.866, + "theoretical_loss": 3.6538454508098286, + "tokens_seen": 986716160 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003540521564694082, + "loss": 2.7182, + "theoretical_loss": 3.6538222993630507, + "tokens_seen": 986781696 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003540421263791374, + "loss": 2.881, + "theoretical_loss": 3.6537991498842883, + "tokens_seen": 986847232 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035403209628886664, + "loss": 2.8058, + "theoretical_loss": 3.653776002373243, + "tokens_seen": 986912768 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035402206619859577, + "loss": 2.9572, + "theoretical_loss": 3.653752856829618, + "tokens_seen": 986978304 + }, + { + "epoch": 2.09, + "learning_rate": 0.000354012036108325, + "loss": 2.9771, + "theoretical_loss": 3.6537297132531137, + "tokens_seen": 987043840 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003540020060180542, + "loss": 2.8324, + "theoretical_loss": 3.6537065716434336, + "tokens_seen": 987109376 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035399197592778336, + "loss": 2.885, + "theoretical_loss": 3.65368343200028, + "tokens_seen": 987174912 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035398194583751254, + "loss": 2.8747, + "theoretical_loss": 3.6536602943233545, + "tokens_seen": 987240448 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003539719157472417, + "loss": 2.9474, + "theoretical_loss": 3.6536371586123604, + "tokens_seen": 987305984 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003539618856569709, + "loss": 2.9924, + "theoretical_loss": 3.6536140248669993, + "tokens_seen": 987371520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035395185556670014, + "loss": 3.0308, + "theoretical_loss": 3.6535908930869745, + "tokens_seen": 987437056 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035394182547642927, + "loss": 2.9119, + "theoretical_loss": 3.653567763271988, + "tokens_seen": 987502592 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003539317953861585, + "loss": 2.9336, + "theoretical_loss": 3.6535446354217433, + "tokens_seen": 987568128 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035392176529588763, + "loss": 2.9895, + "theoretical_loss": 3.6535215095359423, + "tokens_seen": 987633664 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035391173520561687, + "loss": 2.8551, + "theoretical_loss": 3.653498385614288, + "tokens_seen": 987699200 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035390170511534605, + "loss": 2.9849, + "theoretical_loss": 3.6534752636564836, + "tokens_seen": 987764736 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035389167502507523, + "loss": 2.9676, + "theoretical_loss": 3.653452143662232, + "tokens_seen": 987830272 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003538816449348044, + "loss": 2.8699, + "theoretical_loss": 3.653429025631236, + "tokens_seen": 987895808 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1587538, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.873138427734375, + "objective/train/theoretical_loss": 3.653417467351866, + "objective/train/tokens_used": 1008388576, + "theoretical_loss": 3.653417467351866, + "tokens_seen": 987928576 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035387161484453364, + "loss": 2.8846, + "theoretical_loss": 3.6534059095631983, + "tokens_seen": 987961344 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035386158475426277, + "loss": 2.7675, + "theoretical_loss": 3.653382795457823, + "tokens_seen": 988026880 + }, + { + "epoch": 2.09, + "learning_rate": 0.000353851554663992, + "loss": 2.8203, + "theoretical_loss": 3.653359683314812, + "tokens_seen": 988092416 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035384152457372113, + "loss": 2.9751, + "theoretical_loss": 3.6533365731338696, + "tokens_seen": 988157952 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035383149448345037, + "loss": 2.8528, + "theoretical_loss": 3.6533134649146985, + "tokens_seen": 988223488 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035382146439317955, + "loss": 2.917, + "theoretical_loss": 3.653290358657003, + "tokens_seen": 988289024 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035381143430290873, + "loss": 2.7843, + "theoretical_loss": 3.6532672543604856, + "tokens_seen": 988354560 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003538014042126379, + "loss": 2.9411, + "theoretical_loss": 3.65324415202485, + "tokens_seen": 988420096 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003537913741223671, + "loss": 2.9313, + "theoretical_loss": 3.6532210516498003, + "tokens_seen": 988485632 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003537813440320963, + "loss": 2.8637, + "theoretical_loss": 3.65319795323504, + "tokens_seen": 988551168 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003537713139418255, + "loss": 2.9914, + "theoretical_loss": 3.6531748567802715, + "tokens_seen": 988616704 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035376128385155464, + "loss": 2.9179, + "theoretical_loss": 3.6531517622852006, + "tokens_seen": 988682240 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035375125376128387, + "loss": 2.888, + "theoretical_loss": 3.65312866974953, + "tokens_seen": 988747776 + }, + { + "epoch": 2.09, + "learning_rate": 0.000353741223671013, + "loss": 2.8189, + "theoretical_loss": 3.653105579172964, + "tokens_seen": 988813312 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035373119358074223, + "loss": 2.9857, + "theoretical_loss": 3.6530824905552057, + "tokens_seen": 988878848 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003537211634904714, + "loss": 2.9119, + "theoretical_loss": 3.6530594038959605, + "tokens_seen": 988944384 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003537111334002006, + "loss": 2.9907, + "theoretical_loss": 3.6530363191949315, + "tokens_seen": 989009920 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003537011033099298, + "loss": 2.9018, + "theoretical_loss": 3.653013236451823, + "tokens_seen": 989075456 + }, + { + "epoch": 2.09, + "learning_rate": 0.000353691073219659, + "loss": 2.9752, + "theoretical_loss": 3.65299015566634, + "tokens_seen": 989140992 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035368104312938814, + "loss": 2.8155, + "theoretical_loss": 3.652967076838186, + "tokens_seen": 989206528 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003536710130391174, + "loss": 2.9971, + "theoretical_loss": 3.652943999967065, + "tokens_seen": 989272064 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003536609829488465, + "loss": 2.9119, + "theoretical_loss": 3.6529209250526824, + "tokens_seen": 989337600 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035365095285857574, + "loss": 2.9367, + "theoretical_loss": 3.6528978520947417, + "tokens_seen": 989403136 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003536409227683049, + "loss": 2.8055, + "theoretical_loss": 3.6528747810929487, + "tokens_seen": 989468672 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003536308926780341, + "loss": 2.8564, + "theoretical_loss": 3.652851712047007, + "tokens_seen": 989534208 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1590158, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8717777729034424, + "objective/train/theoretical_loss": 3.652840178257388, + "objective/train/tokens_used": 1010026976, + "theoretical_loss": 3.652840178257388, + "tokens_seen": 989566976 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003536208625877633, + "loss": 2.7487, + "theoretical_loss": 3.6528286449566214, + "tokens_seen": 989599744 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035361083249749246, + "loss": 2.9091, + "theoretical_loss": 3.652805579821497, + "tokens_seen": 989665280 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003536008024072217, + "loss": 2.9203, + "theoretical_loss": 3.652782516641338, + "tokens_seen": 989730816 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003535907723169509, + "loss": 3.0779, + "theoretical_loss": 3.65275945541585, + "tokens_seen": 989796352 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035358074222668006, + "loss": 2.8337, + "theoretical_loss": 3.652736396144738, + "tokens_seen": 989861888 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035357071213640924, + "loss": 2.754, + "theoretical_loss": 3.6527133388277058, + "tokens_seen": 989927424 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003535606820461384, + "loss": 2.982, + "theoretical_loss": 3.6526902834644597, + "tokens_seen": 989992960 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003535506519558676, + "loss": 2.9456, + "theoretical_loss": 3.6526672300547043, + "tokens_seen": 990058496 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035354062186559684, + "loss": 2.7622, + "theoretical_loss": 3.6526441785981447, + "tokens_seen": 990124032 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035353059177532597, + "loss": 2.8521, + "theoretical_loss": 3.6526211290944866, + "tokens_seen": 990189568 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003535205616850552, + "loss": 2.9379, + "theoretical_loss": 3.6525980815434345, + "tokens_seen": 990255104 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003535105315947844, + "loss": 2.9613, + "theoretical_loss": 3.652575035944695, + "tokens_seen": 990320640 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035350050150451356, + "loss": 3.0291, + "theoretical_loss": 3.6525519922979726, + "tokens_seen": 990386176 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035349047141424274, + "loss": 2.9439, + "theoretical_loss": 3.6525289506029726, + "tokens_seen": 990451712 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003534804413239719, + "loss": 3.0133, + "theoretical_loss": 3.652505910859402, + "tokens_seen": 990517248 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003534704112337011, + "loss": 2.8335, + "theoretical_loss": 3.6524828730669645, + "tokens_seen": 990582784 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035346038114343034, + "loss": 2.9496, + "theoretical_loss": 3.652459837225367, + "tokens_seen": 990648320 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035345035105315947, + "loss": 2.8746, + "theoretical_loss": 3.6524368033343153, + "tokens_seen": 990713856 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003534403209628887, + "loss": 2.9846, + "theoretical_loss": 3.6524137713935145, + "tokens_seen": 990779392 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035343029087261783, + "loss": 2.8805, + "theoretical_loss": 3.6523907414026713, + "tokens_seen": 990844928 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035342026078234707, + "loss": 2.6678, + "theoretical_loss": 3.652367713361491, + "tokens_seen": 990910464 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035341023069207625, + "loss": 2.9604, + "theoretical_loss": 3.6523446872696805, + "tokens_seen": 990976000 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035340020060180543, + "loss": 2.8593, + "theoretical_loss": 3.6523216631269446, + "tokens_seen": 991041536 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003533901705115346, + "loss": 2.9064, + "theoretical_loss": 3.6522986409329903, + "tokens_seen": 991107072 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035338014042126384, + "loss": 2.8992, + "theoretical_loss": 3.652275620687524, + "tokens_seen": 991172608 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1592918, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.886983871459961, + "objective/train/theoretical_loss": 3.6522641112953815, + "objective/train/tokens_used": 1011665376, + "theoretical_loss": 3.6522641112953815, + "tokens_seen": 991205376 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035337011033099297, + "loss": 2.8316, + "theoretical_loss": 3.6522526023902513, + "tokens_seen": 991238144 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003533600802407222, + "loss": 3.0016, + "theoretical_loss": 3.652229586040879, + "tokens_seen": 991303680 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035335005015045133, + "loss": 2.8881, + "theoretical_loss": 3.6522065716391134, + "tokens_seen": 991369216 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035334002006018057, + "loss": 2.96, + "theoretical_loss": 3.6521835591846603, + "tokens_seen": 991434752 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035332998996990975, + "loss": 2.8362, + "theoretical_loss": 3.6521605486772275, + "tokens_seen": 991500288 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035331995987963893, + "loss": 2.8955, + "theoretical_loss": 3.652137540116521, + "tokens_seen": 991565824 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003533099297893681, + "loss": 2.8035, + "theoretical_loss": 3.652114533502247, + "tokens_seen": 991631360 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003532998996990973, + "loss": 2.7789, + "theoretical_loss": 3.6520915288341125, + "tokens_seen": 991696896 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003532898696088265, + "loss": 2.9517, + "theoretical_loss": 3.6520685261118246, + "tokens_seen": 991762432 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003532798395185557, + "loss": 2.8398, + "theoretical_loss": 3.65204552533509, + "tokens_seen": 991827968 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035326980942828484, + "loss": 2.9775, + "theoretical_loss": 3.6520225265036155, + "tokens_seen": 991893504 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035325977933801407, + "loss": 2.9263, + "theoretical_loss": 3.651999529617108, + "tokens_seen": 991959040 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003532497492477432, + "loss": 2.9238, + "theoretical_loss": 3.651976534675275, + "tokens_seen": 992024576 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035323971915747243, + "loss": 2.9808, + "theoretical_loss": 3.6519535416778224, + "tokens_seen": 992090112 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003532296890672016, + "loss": 2.9107, + "theoretical_loss": 3.651930550624459, + "tokens_seen": 992155648 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003532196589769308, + "loss": 3.0546, + "theoretical_loss": 3.651907561514891, + "tokens_seen": 992221184 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035320962888666, + "loss": 2.8888, + "theoretical_loss": 3.651884574348826, + "tokens_seen": 992286720 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003531995987963892, + "loss": 3.0534, + "theoretical_loss": 3.6518615891259714, + "tokens_seen": 992352256 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035318956870611834, + "loss": 2.8587, + "theoretical_loss": 3.6518386058460344, + "tokens_seen": 992417792 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003531795386158476, + "loss": 3.0046, + "theoretical_loss": 3.651815624508722, + "tokens_seen": 992483328 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003531695085255767, + "loss": 2.7655, + "theoretical_loss": 3.651792645113743, + "tokens_seen": 992548864 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035315947843530594, + "loss": 2.9153, + "theoretical_loss": 3.651769667660804, + "tokens_seen": 992614400 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003531494483450351, + "loss": 3.0223, + "theoretical_loss": 3.6517466921496133, + "tokens_seen": 992679936 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003531394182547643, + "loss": 2.9027, + "theoretical_loss": 3.651723718579878, + "tokens_seen": 992745472 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003531293881644935, + "loss": 2.9324, + "theoretical_loss": 3.651700746951306, + "tokens_seen": 992811008 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1595760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8800737857818604, + "objective/train/theoretical_loss": 3.651689261864865, + "objective/train/tokens_used": 1013303776, + "theoretical_loss": 3.651689261864865, + "tokens_seen": 992843776 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035311935807422266, + "loss": 2.6819, + "theoretical_loss": 3.6516777772636058, + "tokens_seen": 992876544 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035310932798395184, + "loss": 2.8167, + "theoretical_loss": 3.6516548095164847, + "tokens_seen": 992942080 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003530992978936811, + "loss": 2.7458, + "theoretical_loss": 3.651631843709651, + "tokens_seen": 993007616 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003530892678034102, + "loss": 2.9104, + "theoretical_loss": 3.6516088798428123, + "tokens_seen": 993073152 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035307923771313944, + "loss": 2.7857, + "theoretical_loss": 3.6515859179156775, + "tokens_seen": 993138688 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035306920762286857, + "loss": 2.7378, + "theoretical_loss": 3.651562957927954, + "tokens_seen": 993204224 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003530591775325978, + "loss": 3.0481, + "theoretical_loss": 3.651539999879351, + "tokens_seen": 993269760 + }, + { + "epoch": 2.09, + "learning_rate": 0.000353049147442327, + "loss": 2.8481, + "theoretical_loss": 3.6515170437695756, + "tokens_seen": 993335296 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035303911735205617, + "loss": 2.9373, + "theoretical_loss": 3.6514940895983368, + "tokens_seen": 993400832 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035302908726178535, + "loss": 2.972, + "theoretical_loss": 3.651471137365343, + "tokens_seen": 993466368 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003530190571715146, + "loss": 2.9689, + "theoretical_loss": 3.6514481870703026, + "tokens_seen": 993531904 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003530090270812437, + "loss": 2.7914, + "theoretical_loss": 3.6514252387129242, + "tokens_seen": 993597440 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035299899699097294, + "loss": 2.9571, + "theoretical_loss": 3.651402292292916, + "tokens_seen": 993662976 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035298896690070207, + "loss": 2.9807, + "theoretical_loss": 3.651379347809988, + "tokens_seen": 993728512 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003529789368104313, + "loss": 2.9265, + "theoretical_loss": 3.6513564052638476, + "tokens_seen": 993794048 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003529689067201605, + "loss": 2.6717, + "theoretical_loss": 3.6513334646542037, + "tokens_seen": 993859584 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035295887662988967, + "loss": 2.8943, + "theoretical_loss": 3.651310525980766, + "tokens_seen": 993925120 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035294884653961885, + "loss": 2.8842, + "theoretical_loss": 3.6512875892432426, + "tokens_seen": 993990656 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035293881644934803, + "loss": 2.9478, + "theoretical_loss": 3.651264654441343, + "tokens_seen": 994056192 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003529287863590772, + "loss": 3.0362, + "theoretical_loss": 3.6512417215747766, + "tokens_seen": 994121728 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035291875626880645, + "loss": 2.937, + "theoretical_loss": 3.6512187906432514, + "tokens_seen": 994187264 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003529087261785356, + "loss": 2.8766, + "theoretical_loss": 3.6511958616464772, + "tokens_seen": 994252800 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003528986960882648, + "loss": 2.9546, + "theoretical_loss": 3.6511729345841637, + "tokens_seen": 994318336 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035288866599799394, + "loss": 2.9653, + "theoretical_loss": 3.651150009456019, + "tokens_seen": 994383872 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035287863590772317, + "loss": 2.8393, + "theoretical_loss": 3.651127086261754, + "tokens_seen": 994449408 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1598565, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1378753185272217, + "objective/train/theoretical_loss": 3.6511156253897346, + "objective/train/tokens_used": 1014942176, + "theoretical_loss": 3.6511156253897346, + "tokens_seen": 994482176 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035286860581745235, + "loss": 2.8473, + "theoretical_loss": 3.651104165001077, + "tokens_seen": 994514944 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035285857572718153, + "loss": 2.7659, + "theoretical_loss": 3.651081245673698, + "tokens_seen": 994580480 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035284854563691077, + "loss": 3.016, + "theoretical_loss": 3.6510583282793263, + "tokens_seen": 994646016 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035283851554663995, + "loss": 2.8586, + "theoretical_loss": 3.651035412817672, + "tokens_seen": 994711552 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035282848545636913, + "loss": 2.9951, + "theoretical_loss": 3.6510124992884436, + "tokens_seen": 994777088 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003528184553660983, + "loss": 2.7522, + "theoretical_loss": 3.6509895876913525, + "tokens_seen": 994842624 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003528084252758275, + "loss": 2.981, + "theoretical_loss": 3.650966678026107, + "tokens_seen": 994908160 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003527983951855567, + "loss": 2.9212, + "theoretical_loss": 3.650943770292418, + "tokens_seen": 994973696 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003527883650952859, + "loss": 2.9414, + "theoretical_loss": 3.6509208644899953, + "tokens_seen": 995039232 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035277833500501504, + "loss": 2.8026, + "theoretical_loss": 3.6508979606185488, + "tokens_seen": 995104768 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035276830491474427, + "loss": 2.9632, + "theoretical_loss": 3.650875058677788, + "tokens_seen": 995170304 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003527582748244734, + "loss": 2.883, + "theoretical_loss": 3.650852158667424, + "tokens_seen": 995235840 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035274824473420263, + "loss": 2.9474, + "theoretical_loss": 3.6508292605871664, + "tokens_seen": 995301376 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003527382146439318, + "loss": 2.6837, + "theoretical_loss": 3.6508063644367255, + "tokens_seen": 995366912 + }, + { + "epoch": 2.09, + "learning_rate": 0.000352728184553661, + "loss": 2.8545, + "theoretical_loss": 3.6507834702158113, + "tokens_seen": 995432448 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003527181544633902, + "loss": 2.9876, + "theoretical_loss": 3.650760577924135, + "tokens_seen": 995497984 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003527081243731194, + "loss": 2.8824, + "theoretical_loss": 3.6507376875614064, + "tokens_seen": 995563520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035269809428284854, + "loss": 2.9436, + "theoretical_loss": 3.6507147991273357, + "tokens_seen": 995629056 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003526880641925778, + "loss": 2.8951, + "theoretical_loss": 3.6506919126216344, + "tokens_seen": 995694592 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003526780341023069, + "loss": 2.995, + "theoretical_loss": 3.6506690280440126, + "tokens_seen": 995760128 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035266800401203614, + "loss": 2.9314, + "theoretical_loss": 3.650646145394181, + "tokens_seen": 995825664 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003526579739217653, + "loss": 2.6267, + "theoretical_loss": 3.65062326467185, + "tokens_seen": 995891200 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003526479438314945, + "loss": 2.8859, + "theoretical_loss": 3.6506003858767313, + "tokens_seen": 995956736 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003526379137412237, + "loss": 2.9259, + "theoretical_loss": 3.6505775090085355, + "tokens_seen": 996022272 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035262788365095286, + "loss": 2.973, + "theoretical_loss": 3.650554634066973, + "tokens_seen": 996087808 + }, + { + "epoch": 2.09, + "objective/train/docs_used": 1601234, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6681861877441406, + "objective/train/theoretical_loss": 3.650543197318589, + "objective/train/tokens_used": 1016580576, + "theoretical_loss": 3.650543197318589, + "tokens_seen": 996120576 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035261785356068204, + "loss": 2.8078, + "theoretical_loss": 3.650531761051755, + "tokens_seen": 996153344 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003526078234704113, + "loss": 2.8929, + "theoretical_loss": 3.650508889962593, + "tokens_seen": 996218880 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003525977933801404, + "loss": 2.9583, + "theoretical_loss": 3.6504860207991974, + "tokens_seen": 996284416 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035258776328986964, + "loss": 2.8997, + "theoretical_loss": 3.65046315356128, + "tokens_seen": 996349952 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035257773319959877, + "loss": 3.0242, + "theoretical_loss": 3.650440288248552, + "tokens_seen": 996415488 + }, + { + "epoch": 2.09, + "learning_rate": 0.000352567703109328, + "loss": 2.9801, + "theoretical_loss": 3.6504174248607244, + "tokens_seen": 996481024 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003525576730190572, + "loss": 2.9194, + "theoretical_loss": 3.6503945633975086, + "tokens_seen": 996546560 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035254764292878637, + "loss": 2.983, + "theoretical_loss": 3.6503717038586156, + "tokens_seen": 996612096 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035253761283851555, + "loss": 2.9337, + "theoretical_loss": 3.6503488462437583, + "tokens_seen": 996677632 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003525275827482448, + "loss": 3.0648, + "theoretical_loss": 3.6503259905526475, + "tokens_seen": 996743168 + }, + { + "epoch": 2.09, + "learning_rate": 0.0003525175526579739, + "loss": 2.9099, + "theoretical_loss": 3.650303136784994, + "tokens_seen": 996808704 + }, + { + "epoch": 2.09, + "learning_rate": 0.00035250752256770314, + "loss": 2.9875, + "theoretical_loss": 3.6502802849405107, + "tokens_seen": 996874240 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035249749247743227, + "loss": 2.8927, + "theoretical_loss": 3.6502574350189088, + "tokens_seen": 996939776 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003524874623871615, + "loss": 2.9464, + "theoretical_loss": 3.6502345870199004, + "tokens_seen": 997005312 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003524774322968907, + "loss": 2.8957, + "theoretical_loss": 3.650211740943197, + "tokens_seen": 997070848 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035246740220661987, + "loss": 2.8573, + "theoretical_loss": 3.6501888967885106, + "tokens_seen": 997136384 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035245737211634905, + "loss": 2.7531, + "theoretical_loss": 3.6501660545555534, + "tokens_seen": 997201920 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035244734202607823, + "loss": 2.7932, + "theoretical_loss": 3.650143214244037, + "tokens_seen": 997267456 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003524373119358074, + "loss": 2.7915, + "theoretical_loss": 3.6501203758536738, + "tokens_seen": 997332992 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035242728184553665, + "loss": 2.9436, + "theoretical_loss": 3.6500975393841766, + "tokens_seen": 997398528 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003524172517552658, + "loss": 2.9785, + "theoretical_loss": 3.650074704835257, + "tokens_seen": 997464064 + }, + { + "epoch": 2.1, + "learning_rate": 0.000352407221664995, + "loss": 2.9251, + "theoretical_loss": 3.650051872206627, + "tokens_seen": 997529600 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035239719157472414, + "loss": 3.0219, + "theoretical_loss": 3.6500290414979997, + "tokens_seen": 997595136 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035238716148445337, + "loss": 2.8409, + "theoretical_loss": 3.650006212709087, + "tokens_seen": 997660672 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035237713139418255, + "loss": 2.9391, + "theoretical_loss": 3.649983385839602, + "tokens_seen": 997726208 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1602688, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4537830352783203, + "objective/train/theoretical_loss": 3.649971973124554, + "objective/train/tokens_used": 1018218976, + "theoretical_loss": 3.649971973124554, + "tokens_seen": 997758976 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035236710130391173, + "loss": 2.5616, + "theoretical_loss": 3.649960560889256, + "tokens_seen": 997791744 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003523570712136409, + "loss": 3.0341, + "theoretical_loss": 3.649937737857763, + "tokens_seen": 997857280 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035234704112337015, + "loss": 2.9045, + "theoretical_loss": 3.649914916744835, + "tokens_seen": 997922816 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003523370110330993, + "loss": 3.0334, + "theoretical_loss": 3.6498920975501847, + "tokens_seen": 997988352 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003523269809428285, + "loss": 3.0291, + "theoretical_loss": 3.6498692802735255, + "tokens_seen": 998053888 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035231695085255764, + "loss": 3.0056, + "theoretical_loss": 3.649846464914569, + "tokens_seen": 998119424 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003523069207622869, + "loss": 2.9238, + "theoretical_loss": 3.64982365147303, + "tokens_seen": 998184960 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035229689067201606, + "loss": 2.823, + "theoretical_loss": 3.64980083994862, + "tokens_seen": 998250496 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035228686058174524, + "loss": 2.9742, + "theoretical_loss": 3.6497780303410523, + "tokens_seen": 998316032 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003522768304914744, + "loss": 2.8896, + "theoretical_loss": 3.6497552226500405, + "tokens_seen": 998381568 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003522668004012036, + "loss": 3.0556, + "theoretical_loss": 3.649732416875297, + "tokens_seen": 998447104 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003522567703109328, + "loss": 2.8107, + "theoretical_loss": 3.649709613016536, + "tokens_seen": 998512640 + }, + { + "epoch": 2.1, + "learning_rate": 0.000352246740220662, + "loss": 2.8046, + "theoretical_loss": 3.64968681107347, + "tokens_seen": 998578176 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035223671013039114, + "loss": 2.8589, + "theoretical_loss": 3.649664011045813, + "tokens_seen": 998643712 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003522266800401204, + "loss": 2.9925, + "theoretical_loss": 3.649641212933278, + "tokens_seen": 998709248 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035221664994984956, + "loss": 2.7935, + "theoretical_loss": 3.6496184167355787, + "tokens_seen": 998774784 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035220661985957874, + "loss": 2.8843, + "theoretical_loss": 3.649595622452428, + "tokens_seen": 998840320 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003521965897693079, + "loss": 2.8923, + "theoretical_loss": 3.6495728300835397, + "tokens_seen": 998905856 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003521865596790371, + "loss": 2.878, + "theoretical_loss": 3.649550039628629, + "tokens_seen": 998971392 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003521765295887663, + "loss": 3.0023, + "theoretical_loss": 3.6495272510874077, + "tokens_seen": 999036928 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003521664994984955, + "loss": 2.7809, + "theoretical_loss": 3.64950446445959, + "tokens_seen": 999102464 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035215646940822465, + "loss": 3.0532, + "theoretical_loss": 3.6494816797448904, + "tokens_seen": 999168000 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003521464393179539, + "loss": 2.7601, + "theoretical_loss": 3.649458896943022, + "tokens_seen": 999233536 + }, + { + "epoch": 2.1, + "learning_rate": 0.000352136409227683, + "loss": 2.8712, + "theoretical_loss": 3.6494361160536997, + "tokens_seen": 999299072 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035212637913741224, + "loss": 2.8906, + "theoretical_loss": 3.649413337076637, + "tokens_seen": 999364608 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1605516, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9119701385498047, + "objective/train/theoretical_loss": 3.6494019483051137, + "objective/train/tokens_used": 1019857376, + "theoretical_loss": 3.6494019483051137, + "tokens_seen": 999397376 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003521163490471414, + "loss": 2.932, + "theoretical_loss": 3.6493905600115477, + "tokens_seen": 999430144 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003521063189568706, + "loss": 2.8495, + "theoretical_loss": 3.649367784858147, + "tokens_seen": 999495680 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035209628886659984, + "loss": 2.9667, + "theoretical_loss": 3.6493450116161483, + "tokens_seen": 999561216 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035208625877632897, + "loss": 3.0668, + "theoretical_loss": 3.649322240285265, + "tokens_seen": 999626752 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003520762286860582, + "loss": 2.9959, + "theoretical_loss": 3.6492994708652136, + "tokens_seen": 999692288 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003520661985957874, + "loss": 2.9842, + "theoretical_loss": 3.649276703355707, + "tokens_seen": 999757824 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035205616850551657, + "loss": 2.7546, + "theoretical_loss": 3.64925393775646, + "tokens_seen": 999823360 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035204613841524575, + "loss": 2.7003, + "theoretical_loss": 3.649231174067187, + "tokens_seen": 999888896 + }, + { + "epoch": 2.1, + "learning_rate": 0.000352036108324975, + "loss": 2.9802, + "theoretical_loss": 3.649208412287603, + "tokens_seen": 999954432 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003520260782347041, + "loss": 2.8073, + "theoretical_loss": 3.6491856524174224, + "tokens_seen": 1000019968 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035201604814443334, + "loss": 2.8672, + "theoretical_loss": 3.649162894456359, + "tokens_seen": 1000085504 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035200601805416247, + "loss": 2.8629, + "theoretical_loss": 3.6491401384041295, + "tokens_seen": 1000151040 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003519959879638917, + "loss": 2.8614, + "theoretical_loss": 3.649117384260448, + "tokens_seen": 1000216576 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003519859578736209, + "loss": 2.8398, + "theoretical_loss": 3.649094632025028, + "tokens_seen": 1000282112 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035197592778335007, + "loss": 2.9202, + "theoretical_loss": 3.6490718816975862, + "tokens_seen": 1000347648 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035196589769307925, + "loss": 2.921, + "theoretical_loss": 3.649049133277837, + "tokens_seen": 1000413184 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035195586760280843, + "loss": 3.0371, + "theoretical_loss": 3.6490263867654953, + "tokens_seen": 1000478720 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003519458375125376, + "loss": 2.9949, + "theoretical_loss": 3.649003642160276, + "tokens_seen": 1000544256 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035193580742226685, + "loss": 2.9725, + "theoretical_loss": 3.648980899461895, + "tokens_seen": 1000609792 + }, + { + "epoch": 2.1, + "learning_rate": 0.000351925777331996, + "loss": 2.7688, + "theoretical_loss": 3.6489581586700677, + "tokens_seen": 1000675328 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003519157472417252, + "loss": 3.0639, + "theoretical_loss": 3.6489354197845083, + "tokens_seen": 1000740864 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035190571715145434, + "loss": 2.8115, + "theoretical_loss": 3.648912682804933, + "tokens_seen": 1000806400 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035189568706118357, + "loss": 2.8823, + "theoretical_loss": 3.6488899477310577, + "tokens_seen": 1000871936 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035188565697091275, + "loss": 2.9258, + "theoretical_loss": 3.648867214562596, + "tokens_seen": 1000937472 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035187562688064193, + "loss": 2.836, + "theoretical_loss": 3.648844483299266, + "tokens_seen": 1001003008 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1608464, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8444156646728516, + "objective/train/theoretical_loss": 3.648833118381936, + "objective/train/tokens_used": 1021495776, + "theoretical_loss": 3.648833118381936, + "tokens_seen": 1001035776 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003518655967903711, + "loss": 2.8018, + "theoretical_loss": 3.6488217539407817, + "tokens_seen": 1001068544 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035185556670010035, + "loss": 2.7597, + "theoretical_loss": 3.6487990264868593, + "tokens_seen": 1001134080 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003518455366098295, + "loss": 2.8367, + "theoretical_loss": 3.6487763009372145, + "tokens_seen": 1001199616 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003518355065195587, + "loss": 2.9669, + "theoretical_loss": 3.6487535772915627, + "tokens_seen": 1001265152 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035182547642928784, + "loss": 3.0155, + "theoretical_loss": 3.6487308555496205, + "tokens_seen": 1001330688 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003518154463390171, + "loss": 2.8849, + "theoretical_loss": 3.6487081357111033, + "tokens_seen": 1001396224 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035180541624874626, + "loss": 2.9009, + "theoretical_loss": 3.6486854177757273, + "tokens_seen": 1001461760 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035179538615847544, + "loss": 2.8439, + "theoretical_loss": 3.6486627017432083, + "tokens_seen": 1001527296 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003517853560682046, + "loss": 2.9494, + "theoretical_loss": 3.648639987613263, + "tokens_seen": 1001592832 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003517753259779338, + "loss": 2.9064, + "theoretical_loss": 3.648617275385607, + "tokens_seen": 1001658368 + }, + { + "epoch": 2.1, + "learning_rate": 0.000351765295887663, + "loss": 2.9928, + "theoretical_loss": 3.6485945650599567, + "tokens_seen": 1001723904 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003517552657973922, + "loss": 2.8289, + "theoretical_loss": 3.6485718566360283, + "tokens_seen": 1001789440 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035174523570712134, + "loss": 2.9239, + "theoretical_loss": 3.6485491501135385, + "tokens_seen": 1001854976 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003517352056168506, + "loss": 2.9229, + "theoretical_loss": 3.6485264454922035, + "tokens_seen": 1001920512 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035172517552657976, + "loss": 2.9236, + "theoretical_loss": 3.64850374277174, + "tokens_seen": 1001986048 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035171514543630894, + "loss": 2.8017, + "theoretical_loss": 3.6484810419518645, + "tokens_seen": 1002051584 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003517051153460381, + "loss": 2.9021, + "theoretical_loss": 3.648458343032293, + "tokens_seen": 1002117120 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003516950852557673, + "loss": 2.9341, + "theoretical_loss": 3.648435646012743, + "tokens_seen": 1002182656 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003516850551654965, + "loss": 2.6751, + "theoretical_loss": 3.64841295089293, + "tokens_seen": 1002248192 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003516750250752257, + "loss": 2.9826, + "theoretical_loss": 3.6483902576725726, + "tokens_seen": 1002313728 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035166499498495485, + "loss": 2.797, + "theoretical_loss": 3.648367566351386, + "tokens_seen": 1002379264 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003516549648946841, + "loss": 2.8804, + "theoretical_loss": 3.6483448769290883, + "tokens_seen": 1002444800 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003516449348044132, + "loss": 2.8275, + "theoretical_loss": 3.6483221894053957, + "tokens_seen": 1002510336 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035163490471414244, + "loss": 2.8117, + "theoretical_loss": 3.6482995037800254, + "tokens_seen": 1002575872 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003516248746238716, + "loss": 2.8126, + "theoretical_loss": 3.648276820052695, + "tokens_seen": 1002641408 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1611253, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8463926315307617, + "objective/train/theoretical_loss": 3.648265478900706, + "objective/train/tokens_used": 1023134176, + "theoretical_loss": 3.648265478900706, + "tokens_seen": 1002674176 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003516148445336008, + "loss": 2.8969, + "theoretical_loss": 3.6482541382231206, + "tokens_seen": 1002706944 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035160481444333, + "loss": 2.9428, + "theoretical_loss": 3.6482314582910202, + "tokens_seen": 1002772480 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035159478435305917, + "loss": 2.8159, + "theoretical_loss": 3.6482087802561116, + "tokens_seen": 1002838016 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035158475426278835, + "loss": 2.7057, + "theoretical_loss": 3.6481861041181105, + "tokens_seen": 1002903552 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003515747241725176, + "loss": 2.9181, + "theoretical_loss": 3.648163429876736, + "tokens_seen": 1002969088 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003515646940822467, + "loss": 2.8403, + "theoretical_loss": 3.648140757531704, + "tokens_seen": 1003034624 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035155466399197595, + "loss": 2.7521, + "theoretical_loss": 3.648118087082733, + "tokens_seen": 1003100160 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035154463390170513, + "loss": 2.8704, + "theoretical_loss": 3.648095418529541, + "tokens_seen": 1003165696 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003515346038114343, + "loss": 2.7896, + "theoretical_loss": 3.6480727518718443, + "tokens_seen": 1003231232 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003515245737211635, + "loss": 2.7533, + "theoretical_loss": 3.648050087109362, + "tokens_seen": 1003296768 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035151454363089267, + "loss": 2.7756, + "theoretical_loss": 3.6480274242418105, + "tokens_seen": 1003362304 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035150451354062185, + "loss": 3.0422, + "theoretical_loss": 3.6480047632689083, + "tokens_seen": 1003427840 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003514944834503511, + "loss": 2.9286, + "theoretical_loss": 3.6479821041903735, + "tokens_seen": 1003493376 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003514844533600802, + "loss": 2.8224, + "theoretical_loss": 3.647959447005924, + "tokens_seen": 1003558912 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035147442326980945, + "loss": 2.9814, + "theoretical_loss": 3.647936791715277, + "tokens_seen": 1003624448 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003514643931795386, + "loss": 2.8736, + "theoretical_loss": 3.6479141383181517, + "tokens_seen": 1003689984 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003514543630892678, + "loss": 2.9953, + "theoretical_loss": 3.647891486814265, + "tokens_seen": 1003755520 + }, + { + "epoch": 2.1, + "learning_rate": 0.000351444332998997, + "loss": 2.945, + "theoretical_loss": 3.647868837203336, + "tokens_seen": 1003821056 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003514343029087262, + "loss": 2.92, + "theoretical_loss": 3.647846189485083, + "tokens_seen": 1003886592 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035142427281845536, + "loss": 2.8768, + "theoretical_loss": 3.6478235436592237, + "tokens_seen": 1003952128 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035141424272818454, + "loss": 2.7271, + "theoretical_loss": 3.6478008997254765, + "tokens_seen": 1004017664 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003514042126379137, + "loss": 2.9312, + "theoretical_loss": 3.6477782576835605, + "tokens_seen": 1004083200 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035139418254764295, + "loss": 2.9223, + "theoretical_loss": 3.6477556175331936, + "tokens_seen": 1004148736 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003513841524573721, + "loss": 2.8098, + "theoretical_loss": 3.6477329792740942, + "tokens_seen": 1004214272 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003513741223671013, + "loss": 2.8682, + "theoretical_loss": 3.647710342905982, + "tokens_seen": 1004279808 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.353248119354248, + "objective/train/theoretical_loss": 3.647699025430957, + "objective/train/tokens_used": 1024772576, + "theoretical_loss": 3.647699025430957, + "tokens_seen": 1004312576 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003513640922768305, + "loss": 2.7651, + "theoretical_loss": 3.647687708428574, + "tokens_seen": 1004345344 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003513540621865597, + "loss": 2.909, + "theoretical_loss": 3.64766507584159, + "tokens_seen": 1004410880 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003513440320962889, + "loss": 2.9393, + "theoretical_loss": 3.647642445144748, + "tokens_seen": 1004476416 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035133400200601804, + "loss": 2.9174, + "theoretical_loss": 3.6476198163377678, + "tokens_seen": 1004541952 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003513239719157473, + "loss": 2.8652, + "theoretical_loss": 3.647597189420368, + "tokens_seen": 1004607488 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035131394182547646, + "loss": 3.0058, + "theoretical_loss": 3.6475745643922677, + "tokens_seen": 1004673024 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035130391173520564, + "loss": 2.6772, + "theoretical_loss": 3.6475519412531856, + "tokens_seen": 1004738560 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003512938816449348, + "loss": 2.8265, + "theoretical_loss": 3.6475293200028402, + "tokens_seen": 1004804096 + }, + { + "epoch": 2.1, + "learning_rate": 0.000351283851554664, + "loss": 2.9818, + "theoretical_loss": 3.647506700640952, + "tokens_seen": 1004869632 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003512738214643932, + "loss": 2.7924, + "theoretical_loss": 3.647484083167239, + "tokens_seen": 1004935168 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003512637913741224, + "loss": 2.8645, + "theoretical_loss": 3.647461467581421, + "tokens_seen": 1005000704 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035125376128385154, + "loss": 2.8471, + "theoretical_loss": 3.647438853883217, + "tokens_seen": 1005066240 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003512437311935808, + "loss": 2.8278, + "theoretical_loss": 3.6474162420723473, + "tokens_seen": 1005131776 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035123370110330996, + "loss": 3.0095, + "theoretical_loss": 3.64739363214853, + "tokens_seen": 1005197312 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035122367101303914, + "loss": 2.992, + "theoretical_loss": 3.6473710241114863, + "tokens_seen": 1005262848 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003512136409227683, + "loss": 2.8275, + "theoretical_loss": 3.6473484179609343, + "tokens_seen": 1005328384 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003512036108324975, + "loss": 2.9577, + "theoretical_loss": 3.6473258136965936, + "tokens_seen": 1005393920 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003511935807422267, + "loss": 2.8786, + "theoretical_loss": 3.647303211318185, + "tokens_seen": 1005459456 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003511835506519559, + "loss": 2.9176, + "theoretical_loss": 3.647280610825427, + "tokens_seen": 1005524992 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035117352056168505, + "loss": 2.9378, + "theoretical_loss": 3.6472580122180407, + "tokens_seen": 1005590528 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003511634904714143, + "loss": 2.8933, + "theoretical_loss": 3.6472354154957447, + "tokens_seen": 1005656064 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003511534603811434, + "loss": 2.8701, + "theoretical_loss": 3.6472128206582597, + "tokens_seen": 1005721600 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035114343029087264, + "loss": 2.8654, + "theoretical_loss": 3.6471902277053054, + "tokens_seen": 1005787136 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003511334002006018, + "loss": 2.9295, + "theoretical_loss": 3.6471676366366017, + "tokens_seen": 1005852672 + }, + { + "epoch": 2.1, + "learning_rate": 0.000351123370110331, + "loss": 2.9495, + "theoretical_loss": 3.647145047451869, + "tokens_seen": 1005918208 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9857630729675293, + "objective/train/theoretical_loss": 3.647133753565905, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.647133753565905, + "tokens_seen": 1005950976 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003511133400200602, + "loss": 2.8655, + "theoretical_loss": 3.647122460150828, + "tokens_seen": 1005983744 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035110330992978937, + "loss": 2.9925, + "theoretical_loss": 3.6470998747331977, + "tokens_seen": 1006049280 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035109327983951855, + "loss": 2.9697, + "theoretical_loss": 3.6470772911986993, + "tokens_seen": 1006114816 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003510832497492478, + "loss": 2.9675, + "theoretical_loss": 3.647054709547053, + "tokens_seen": 1006180352 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003510732196589769, + "loss": 2.8758, + "theoretical_loss": 3.647032129777978, + "tokens_seen": 1006245888 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035106318956870615, + "loss": 2.6726, + "theoretical_loss": 3.647009551891197, + "tokens_seen": 1006311424 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035105315947843533, + "loss": 2.9475, + "theoretical_loss": 3.6469869758864286, + "tokens_seen": 1006376960 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003510431293881645, + "loss": 2.9221, + "theoretical_loss": 3.646964401763394, + "tokens_seen": 1006442496 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003510330992978937, + "loss": 2.7613, + "theoretical_loss": 3.646941829521814, + "tokens_seen": 1006508032 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035102306920762287, + "loss": 2.7292, + "theoretical_loss": 3.6469192591614092, + "tokens_seen": 1006573568 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035101303911735205, + "loss": 2.9736, + "theoretical_loss": 3.646896690681901, + "tokens_seen": 1006639104 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003510030090270813, + "loss": 2.7927, + "theoretical_loss": 3.646874124083009, + "tokens_seen": 1006704640 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003509929789368104, + "loss": 2.9743, + "theoretical_loss": 3.6468515593644546, + "tokens_seen": 1006770176 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035098294884653965, + "loss": 2.9689, + "theoretical_loss": 3.646828996525959, + "tokens_seen": 1006835712 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003509729187562688, + "loss": 2.8973, + "theoretical_loss": 3.6468064355672434, + "tokens_seen": 1006901248 + }, + { + "epoch": 2.1, + "learning_rate": 0.000350962888665998, + "loss": 3.017, + "theoretical_loss": 3.6467838764880276, + "tokens_seen": 1006966784 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003509528585757272, + "loss": 2.911, + "theoretical_loss": 3.646761319288034, + "tokens_seen": 1007032320 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003509428284854564, + "loss": 2.8827, + "theoretical_loss": 3.6467387639669835, + "tokens_seen": 1007097856 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035093279839518556, + "loss": 2.9198, + "theoretical_loss": 3.646716210524597, + "tokens_seen": 1007163392 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035092276830491474, + "loss": 2.7774, + "theoretical_loss": 3.6466936589605963, + "tokens_seen": 1007228928 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003509127382146439, + "loss": 2.9244, + "theoretical_loss": 3.6466711092747017, + "tokens_seen": 1007294464 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035090270812437315, + "loss": 2.8224, + "theoretical_loss": 3.6466485614666357, + "tokens_seen": 1007360000 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003508926780341023, + "loss": 3.023, + "theoretical_loss": 3.64662601553612, + "tokens_seen": 1007425536 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003508826479438315, + "loss": 2.8304, + "theoretical_loss": 3.646603471482875, + "tokens_seen": 1007491072 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003508726178535607, + "loss": 2.7522, + "theoretical_loss": 3.646580929306623, + "tokens_seen": 1007556608 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.131194829940796, + "objective/train/theoretical_loss": 3.646569658922282, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.646569658922282, + "tokens_seen": 1007589376 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003508625877632899, + "loss": 2.9871, + "theoretical_loss": 3.646558389007085, + "tokens_seen": 1007622144 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035085255767301906, + "loss": 2.8232, + "theoretical_loss": 3.6465358505839838, + "tokens_seen": 1007687680 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035084252758274824, + "loss": 2.6964, + "theoretical_loss": 3.6465133140370405, + "tokens_seen": 1007753216 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003508324974924774, + "loss": 2.7808, + "theoretical_loss": 3.6464907793659767, + "tokens_seen": 1007818752 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035082246740220666, + "loss": 2.9136, + "theoretical_loss": 3.6464682465705147, + "tokens_seen": 1007884288 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003508124373119358, + "loss": 2.8781, + "theoretical_loss": 3.6464457156503762, + "tokens_seen": 1007949824 + }, + { + "epoch": 2.1, + "learning_rate": 0.000350802407221665, + "loss": 2.7504, + "theoretical_loss": 3.6464231866052836, + "tokens_seen": 1008015360 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035079237713139415, + "loss": 3.0345, + "theoretical_loss": 3.646400659434959, + "tokens_seen": 1008080896 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003507823470411234, + "loss": 2.9732, + "theoretical_loss": 3.6463781341391233, + "tokens_seen": 1008146432 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035077231695085256, + "loss": 2.8442, + "theoretical_loss": 3.6463556107175004, + "tokens_seen": 1008211968 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035076228686058174, + "loss": 3.0404, + "theoretical_loss": 3.6463330891698114, + "tokens_seen": 1008277504 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003507522567703109, + "loss": 2.7264, + "theoretical_loss": 3.646310569495779, + "tokens_seen": 1008343040 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035074222668004016, + "loss": 2.9967, + "theoretical_loss": 3.6462880516951257, + "tokens_seen": 1008408576 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003507321965897693, + "loss": 3.0442, + "theoretical_loss": 3.646265535767574, + "tokens_seen": 1008474112 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003507221664994985, + "loss": 2.9602, + "theoretical_loss": 3.646243021712846, + "tokens_seen": 1008539648 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035071213640922765, + "loss": 2.9283, + "theoretical_loss": 3.646220509530664, + "tokens_seen": 1008605184 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003507021063189569, + "loss": 2.8708, + "theoretical_loss": 3.646197999220751, + "tokens_seen": 1008670720 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035069207622868607, + "loss": 2.8982, + "theoretical_loss": 3.64617549078283, + "tokens_seen": 1008736256 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035068204613841525, + "loss": 2.9686, + "theoretical_loss": 3.6461529842166227, + "tokens_seen": 1008801792 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035067201604814443, + "loss": 3.0062, + "theoretical_loss": 3.6461304795218528, + "tokens_seen": 1008867328 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003506619859578736, + "loss": 2.8665, + "theoretical_loss": 3.6461079766982434, + "tokens_seen": 1008932864 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003506519558676028, + "loss": 2.8666, + "theoretical_loss": 3.6460854757455166, + "tokens_seen": 1008998400 + }, + { + "epoch": 2.1, + "learning_rate": 0.000350641925777332, + "loss": 2.8393, + "theoretical_loss": 3.646062976663395, + "tokens_seen": 1009063936 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035063189568706115, + "loss": 2.8419, + "theoretical_loss": 3.646040479451603, + "tokens_seen": 1009129472 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003506218655967904, + "loss": 2.7937, + "theoretical_loss": 3.6460179841098626, + "tokens_seen": 1009195008 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.078022003173828, + "objective/train/theoretical_loss": 3.6460067371401754, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.6460067371401754, + "tokens_seen": 1009227776 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003506118355065195, + "loss": 2.8782, + "theoretical_loss": 3.645995490637897, + "tokens_seen": 1009260544 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035060180541624875, + "loss": 2.8212, + "theoretical_loss": 3.64597299903543, + "tokens_seen": 1009326080 + }, + { + "epoch": 2.1, + "learning_rate": 0.000350591775325978, + "loss": 2.7953, + "theoretical_loss": 3.645950509302184, + "tokens_seen": 1009391616 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003505817452357071, + "loss": 2.9199, + "theoretical_loss": 3.645928021437883, + "tokens_seen": 1009457152 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035057171514543635, + "loss": 2.8117, + "theoretical_loss": 3.6459055354422505, + "tokens_seen": 1009522688 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035056168505516553, + "loss": 2.9925, + "theoretical_loss": 3.6458830513150087, + "tokens_seen": 1009588224 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003505516549648947, + "loss": 3.0128, + "theoretical_loss": 3.6458605690558823, + "tokens_seen": 1009653760 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003505416248746239, + "loss": 2.8732, + "theoretical_loss": 3.645838088664595, + "tokens_seen": 1009719296 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035053159478435307, + "loss": 3.0395, + "theoretical_loss": 3.645815610140869, + "tokens_seen": 1009784832 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035052156469408225, + "loss": 3.0132, + "theoretical_loss": 3.64579313348443, + "tokens_seen": 1009850368 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003505115346038115, + "loss": 3.0525, + "theoretical_loss": 3.6457706586949996, + "tokens_seen": 1009915904 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003505015045135406, + "loss": 2.9358, + "theoretical_loss": 3.6457481857723026, + "tokens_seen": 1009981440 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035049147442326985, + "loss": 3.0148, + "theoretical_loss": 3.6457257147160633, + "tokens_seen": 1010046976 + }, + { + "epoch": 2.1, + "learning_rate": 0.000350481444332999, + "loss": 2.8613, + "theoretical_loss": 3.645703245526005, + "tokens_seen": 1010112512 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003504714142427282, + "loss": 2.9476, + "theoretical_loss": 3.645680778201851, + "tokens_seen": 1010178048 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003504613841524574, + "loss": 2.9597, + "theoretical_loss": 3.6456583127433264, + "tokens_seen": 1010243584 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003504513540621866, + "loss": 2.9071, + "theoretical_loss": 3.6456358491501555, + "tokens_seen": 1010309120 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035044132397191576, + "loss": 2.9955, + "theoretical_loss": 3.6456133874220615, + "tokens_seen": 1010374656 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035043129388164494, + "loss": 2.7977, + "theoretical_loss": 3.645590927558769, + "tokens_seen": 1010440192 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003504212637913741, + "loss": 3.0587, + "theoretical_loss": 3.6455684695600015, + "tokens_seen": 1010505728 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035041123370110335, + "loss": 2.7913, + "theoretical_loss": 3.6455460134254847, + "tokens_seen": 1010571264 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003504012036108325, + "loss": 2.8651, + "theoretical_loss": 3.645523559154942, + "tokens_seen": 1010636800 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003503911735205617, + "loss": 2.8282, + "theoretical_loss": 3.6455011067480982, + "tokens_seen": 1010702336 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003503811434302909, + "loss": 2.6645, + "theoretical_loss": 3.645478656204678, + "tokens_seen": 1010767872 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003503711133400201, + "loss": 3.0159, + "theoretical_loss": 3.645456207524405, + "tokens_seen": 1010833408 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.836188554763794, + "objective/train/theoretical_loss": 3.6454449838828626, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.6454449838828626, + "tokens_seen": 1010866176 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035036108324974926, + "loss": 2.8842, + "theoretical_loss": 3.6454337607070046, + "tokens_seen": 1010898944 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035035105315947844, + "loss": 3.0142, + "theoretical_loss": 3.645411315752201, + "tokens_seen": 1010964480 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003503410230692076, + "loss": 2.9133, + "theoretical_loss": 3.6453888726597192, + "tokens_seen": 1011030016 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035033099297893686, + "loss": 2.7737, + "theoretical_loss": 3.6453664314292844, + "tokens_seen": 1011095552 + }, + { + "epoch": 2.1, + "learning_rate": 0.000350320962888666, + "loss": 2.8666, + "theoretical_loss": 3.6453439920606208, + "tokens_seen": 1011161088 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003503109327983952, + "loss": 2.8963, + "theoretical_loss": 3.6453215545534534, + "tokens_seen": 1011226624 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035030090270812435, + "loss": 2.8193, + "theoretical_loss": 3.645299118907507, + "tokens_seen": 1011292160 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003502908726178536, + "loss": 2.933, + "theoretical_loss": 3.645276685122507, + "tokens_seen": 1011357696 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035028084252758276, + "loss": 2.8995, + "theoretical_loss": 3.6452542531981784, + "tokens_seen": 1011423232 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035027081243731194, + "loss": 2.9324, + "theoretical_loss": 3.6452318231342464, + "tokens_seen": 1011488768 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003502607823470411, + "loss": 2.9195, + "theoretical_loss": 3.645209394930436, + "tokens_seen": 1011554304 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035025075225677036, + "loss": 2.8096, + "theoretical_loss": 3.645186968586472, + "tokens_seen": 1011619840 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003502407221664995, + "loss": 2.9187, + "theoretical_loss": 3.6451645441020806, + "tokens_seen": 1011685376 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003502306920762287, + "loss": 2.7603, + "theoretical_loss": 3.6451421214769866, + "tokens_seen": 1011750912 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035022066198595785, + "loss": 2.8539, + "theoretical_loss": 3.6451197007109153, + "tokens_seen": 1011816448 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003502106318956871, + "loss": 2.9142, + "theoretical_loss": 3.6450972818035927, + "tokens_seen": 1011881984 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035020060180541627, + "loss": 2.9825, + "theoretical_loss": 3.6450748647547444, + "tokens_seen": 1011947520 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035019057171514545, + "loss": 3.0291, + "theoretical_loss": 3.645052449564095, + "tokens_seen": 1012013056 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035018054162487463, + "loss": 2.9935, + "theoretical_loss": 3.645030036231371, + "tokens_seen": 1012078592 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003501705115346038, + "loss": 2.8448, + "theoretical_loss": 3.645007624756298, + "tokens_seen": 1012144128 + }, + { + "epoch": 2.1, + "learning_rate": 0.000350160481444333, + "loss": 2.8918, + "theoretical_loss": 3.6449852151386013, + "tokens_seen": 1012209664 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003501504513540622, + "loss": 2.9791, + "theoretical_loss": 3.6449628073780076, + "tokens_seen": 1012275200 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035014042126379135, + "loss": 2.93, + "theoretical_loss": 3.644940401474242, + "tokens_seen": 1012340736 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003501303911735206, + "loss": 2.8463, + "theoretical_loss": 3.6449179974270307, + "tokens_seen": 1012406272 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003501203610832497, + "loss": 2.8375, + "theoretical_loss": 3.6448955952360995, + "tokens_seen": 1012471808 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9342548847198486, + "objective/train/theoretical_loss": 3.6448843948366534, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.6448843948366534, + "tokens_seen": 1012504576 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035011033099297895, + "loss": 2.9114, + "theoretical_loss": 3.644873194901175, + "tokens_seen": 1012537344 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035010030090270813, + "loss": 2.937, + "theoretical_loss": 3.644850796421983, + "tokens_seen": 1012602880 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003500902708124373, + "loss": 2.9659, + "theoretical_loss": 3.6448283997982496, + "tokens_seen": 1012668416 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003500802407221665, + "loss": 2.9075, + "theoretical_loss": 3.644806005029701, + "tokens_seen": 1012733952 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035007021063189573, + "loss": 2.9214, + "theoretical_loss": 3.644783612116064, + "tokens_seen": 1012799488 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035006018054162486, + "loss": 2.9317, + "theoretical_loss": 3.644761221057064, + "tokens_seen": 1012865024 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003500501504513541, + "loss": 2.8654, + "theoretical_loss": 3.6447388318524285, + "tokens_seen": 1012930560 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003500401203610832, + "loss": 2.8917, + "theoretical_loss": 3.644716444501883, + "tokens_seen": 1012996096 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035003009027081245, + "loss": 2.902, + "theoretical_loss": 3.644694059005155, + "tokens_seen": 1013061632 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035002006018054163, + "loss": 2.8231, + "theoretical_loss": 3.6446716753619697, + "tokens_seen": 1013127168 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003500100300902708, + "loss": 2.9246, + "theoretical_loss": 3.644649293572055, + "tokens_seen": 1013192704 + }, + { + "epoch": 2.1, + "learning_rate": 0.00035, + "loss": 2.9279, + "theoretical_loss": 3.644626913635137, + "tokens_seen": 1013258240 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003499899699097292, + "loss": 2.7898, + "theoretical_loss": 3.6446045355509424, + "tokens_seen": 1013323776 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034997993981945836, + "loss": 2.7704, + "theoretical_loss": 3.644582159319199, + "tokens_seen": 1013389312 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003499699097291876, + "loss": 2.7183, + "theoretical_loss": 3.6445597849396325, + "tokens_seen": 1013454848 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003499598796389167, + "loss": 2.9436, + "theoretical_loss": 3.64453741241197, + "tokens_seen": 1013520384 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034994984954864596, + "loss": 2.8785, + "theoretical_loss": 3.6445150417359393, + "tokens_seen": 1013585920 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003499398194583751, + "loss": 2.8615, + "theoretical_loss": 3.6444926729112663, + "tokens_seen": 1013651456 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003499297893681043, + "loss": 2.8464, + "theoretical_loss": 3.644470305937679, + "tokens_seen": 1013716992 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003499197592778335, + "loss": 2.8954, + "theoretical_loss": 3.6444479408149038, + "tokens_seen": 1013782528 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003499097291875627, + "loss": 2.8621, + "theoretical_loss": 3.644425577542669, + "tokens_seen": 1013848064 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034989969909729186, + "loss": 2.7351, + "theoretical_loss": 3.6444032161207005, + "tokens_seen": 1013913600 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003498896690070211, + "loss": 2.9352, + "theoretical_loss": 3.644380856548727, + "tokens_seen": 1013979136 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003498796389167502, + "loss": 2.849, + "theoretical_loss": 3.644358498826475, + "tokens_seen": 1014044672 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034986960882647946, + "loss": 2.9119, + "theoretical_loss": 3.644336142953672, + "tokens_seen": 1014110208 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9820199012756348, + "objective/train/theoretical_loss": 3.644324965710729, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.644324965710729, + "tokens_seen": 1014142976 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003498595787362086, + "loss": 2.9102, + "theoretical_loss": 3.644313788930046, + "tokens_seen": 1014175744 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003498495486459378, + "loss": 2.8984, + "theoretical_loss": 3.644291436755324, + "tokens_seen": 1014241280 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034983951855566706, + "loss": 2.7142, + "theoretical_loss": 3.644269086429234, + "tokens_seen": 1014306816 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003498294884653962, + "loss": 2.8349, + "theoretical_loss": 3.6442467379515033, + "tokens_seen": 1014372352 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003498194583751254, + "loss": 3.0337, + "theoretical_loss": 3.6442243913218606, + "tokens_seen": 1014437888 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034980942828485455, + "loss": 2.9947, + "theoretical_loss": 3.644202046540032, + "tokens_seen": 1014503424 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003497993981945838, + "loss": 2.7804, + "theoretical_loss": 3.6441797036057477, + "tokens_seen": 1014568960 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034978936810431296, + "loss": 2.918, + "theoretical_loss": 3.644157362518733, + "tokens_seen": 1014634496 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034977933801404214, + "loss": 2.8904, + "theoretical_loss": 3.644135023278718, + "tokens_seen": 1014700032 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003497693079237713, + "loss": 2.834, + "theoretical_loss": 3.64411268588543, + "tokens_seen": 1014765568 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034975927783350056, + "loss": 2.9007, + "theoretical_loss": 3.6440903503385966, + "tokens_seen": 1014831104 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003497492477432297, + "loss": 2.8928, + "theoretical_loss": 3.644068016637946, + "tokens_seen": 1014896640 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003497392176529589, + "loss": 2.8311, + "theoretical_loss": 3.644045684783207, + "tokens_seen": 1014962176 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034972918756268805, + "loss": 2.6375, + "theoretical_loss": 3.6440233547741077, + "tokens_seen": 1015027712 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003497191574724173, + "loss": 2.9055, + "theoretical_loss": 3.6440010266103755, + "tokens_seen": 1015093248 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034970912738214647, + "loss": 2.9157, + "theoretical_loss": 3.6439787002917408, + "tokens_seen": 1015158784 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034969909729187565, + "loss": 2.8823, + "theoretical_loss": 3.64395637581793, + "tokens_seen": 1015224320 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034968906720160483, + "loss": 2.8383, + "theoretical_loss": 3.6439340531886724, + "tokens_seen": 1015289856 + }, + { + "epoch": 2.1, + "learning_rate": 0.000349679037111334, + "loss": 2.8889, + "theoretical_loss": 3.6439117324036965, + "tokens_seen": 1015355392 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003496690070210632, + "loss": 2.6885, + "theoretical_loss": 3.6438894134627313, + "tokens_seen": 1015420928 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003496589769307924, + "loss": 2.8046, + "theoretical_loss": 3.643867096365504, + "tokens_seen": 1015486464 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034964894684052155, + "loss": 2.9424, + "theoretical_loss": 3.6438447811117456, + "tokens_seen": 1015552000 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003496389167502508, + "loss": 2.7576, + "theoretical_loss": 3.6438224677011832, + "tokens_seen": 1015617536 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003496288866599799, + "loss": 2.8541, + "theoretical_loss": 3.6438001561335462, + "tokens_seen": 1015683072 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034961885656970915, + "loss": 2.848, + "theoretical_loss": 3.6437778464085637, + "tokens_seen": 1015748608 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.845590829849243, + "objective/train/theoretical_loss": 3.6437666922369827, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.6437666922369827, + "tokens_seen": 1015781376 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034960882647943833, + "loss": 2.9065, + "theoretical_loss": 3.643755538525964, + "tokens_seen": 1015814144 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003495987963891675, + "loss": 2.9564, + "theoretical_loss": 3.643733232485476, + "tokens_seen": 1015879680 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003495887662988967, + "loss": 2.8311, + "theoretical_loss": 3.6437109282868296, + "tokens_seen": 1015945216 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034957873620862593, + "loss": 2.943, + "theoretical_loss": 3.6436886259297534, + "tokens_seen": 1016010752 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034956870611835506, + "loss": 2.8841, + "theoretical_loss": 3.643666325413977, + "tokens_seen": 1016076288 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003495586760280843, + "loss": 2.8605, + "theoretical_loss": 3.6436440267392287, + "tokens_seen": 1016141824 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003495486459378134, + "loss": 3.0397, + "theoretical_loss": 3.643621729905239, + "tokens_seen": 1016207360 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034953861584754265, + "loss": 2.8675, + "theoretical_loss": 3.6435994349117364, + "tokens_seen": 1016272896 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034952858575727183, + "loss": 2.7721, + "theoretical_loss": 3.643577141758451, + "tokens_seen": 1016338432 + }, + { + "epoch": 2.1, + "learning_rate": 0.000349518555667001, + "loss": 2.9128, + "theoretical_loss": 3.643554850445111, + "tokens_seen": 1016403968 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003495085255767302, + "loss": 2.9981, + "theoretical_loss": 3.6435325609714475, + "tokens_seen": 1016469504 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003494984954864594, + "loss": 2.8148, + "theoretical_loss": 3.6435102733371894, + "tokens_seen": 1016535040 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034948846539618856, + "loss": 2.9241, + "theoretical_loss": 3.6434879875420654, + "tokens_seen": 1016600576 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003494784353059178, + "loss": 2.9064, + "theoretical_loss": 3.6434657035858065, + "tokens_seen": 1016666112 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003494684052156469, + "loss": 3.0356, + "theoretical_loss": 3.643443421468142, + "tokens_seen": 1016731648 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034945837512537616, + "loss": 2.6495, + "theoretical_loss": 3.643421141188802, + "tokens_seen": 1016797184 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003494483450351053, + "loss": 3.0071, + "theoretical_loss": 3.643398862747516, + "tokens_seen": 1016862720 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003494383149448345, + "loss": 2.9994, + "theoretical_loss": 3.6433765861440137, + "tokens_seen": 1016928256 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003494282848545637, + "loss": 2.7001, + "theoretical_loss": 3.6433543113780256, + "tokens_seen": 1016993792 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003494182547642929, + "loss": 2.9986, + "theoretical_loss": 3.6433320384492816, + "tokens_seen": 1017059328 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034940822467402206, + "loss": 2.8809, + "theoretical_loss": 3.643309767357511, + "tokens_seen": 1017124864 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003493981945837513, + "loss": 2.8605, + "theoretical_loss": 3.6432874981024455, + "tokens_seen": 1017190400 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003493881644934804, + "loss": 2.8728, + "theoretical_loss": 3.643265230683814, + "tokens_seen": 1017255936 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034937813440320966, + "loss": 3.018, + "theoretical_loss": 3.643242965101347, + "tokens_seen": 1017321472 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003493681043129388, + "loss": 2.7799, + "theoretical_loss": 3.6432207013547755, + "tokens_seen": 1017387008 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7187788486480713, + "objective/train/theoretical_loss": 3.643209570169866, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.643209570169866, + "tokens_seen": 1017419776 + }, + { + "epoch": 2.1, + "learning_rate": 0.000349358074222668, + "loss": 2.5878, + "theoretical_loss": 3.6431984394438293, + "tokens_seen": 1017452544 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003493480441323972, + "loss": 2.8775, + "theoretical_loss": 3.643176179368239, + "tokens_seen": 1017518080 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003493380140421264, + "loss": 2.8734, + "theoretical_loss": 3.643153921127735, + "tokens_seen": 1017583616 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034932798395185557, + "loss": 2.7146, + "theoretical_loss": 3.6431316647220475, + "tokens_seen": 1017649152 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034931795386158475, + "loss": 2.8918, + "theoretical_loss": 3.643109410150908, + "tokens_seen": 1017714688 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034930792377131393, + "loss": 3.0091, + "theoretical_loss": 3.643087157414046, + "tokens_seen": 1017780224 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034929789368104316, + "loss": 2.9985, + "theoretical_loss": 3.643064906511193, + "tokens_seen": 1017845760 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003492878635907723, + "loss": 2.7275, + "theoretical_loss": 3.6430426574420802, + "tokens_seen": 1017911296 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003492778335005015, + "loss": 2.8909, + "theoretical_loss": 3.6430204102064376, + "tokens_seen": 1017976832 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034926780341023065, + "loss": 2.9601, + "theoretical_loss": 3.642998164803996, + "tokens_seen": 1018042368 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003492577733199599, + "loss": 2.7421, + "theoretical_loss": 3.6429759212344868, + "tokens_seen": 1018107904 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034924774322968907, + "loss": 2.9079, + "theoretical_loss": 3.6429536794976416, + "tokens_seen": 1018173440 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034923771313941825, + "loss": 3.0467, + "theoretical_loss": 3.64293143959319, + "tokens_seen": 1018238976 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034922768304914743, + "loss": 2.8847, + "theoretical_loss": 3.642909201520864, + "tokens_seen": 1018304512 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034921765295887667, + "loss": 2.9826, + "theoretical_loss": 3.642886965280395, + "tokens_seen": 1018370048 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003492076228686058, + "loss": 2.7269, + "theoretical_loss": 3.6428647308715134, + "tokens_seen": 1018435584 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034919759277833503, + "loss": 2.9526, + "theoretical_loss": 3.642842498293951, + "tokens_seen": 1018501120 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034918756268806416, + "loss": 2.948, + "theoretical_loss": 3.6428202675474393, + "tokens_seen": 1018566656 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003491775325977934, + "loss": 2.9175, + "theoretical_loss": 3.6427980386317094, + "tokens_seen": 1018632192 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034916750250752257, + "loss": 2.9169, + "theoretical_loss": 3.6427758115464925, + "tokens_seen": 1018697728 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034915747241725175, + "loss": 2.7227, + "theoretical_loss": 3.642753586291521, + "tokens_seen": 1018763264 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034914744232698093, + "loss": 2.7001, + "theoretical_loss": 3.642731362866526, + "tokens_seen": 1018828800 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003491374122367101, + "loss": 2.742, + "theoretical_loss": 3.6427091412712387, + "tokens_seen": 1018894336 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003491273821464393, + "loss": 2.7765, + "theoretical_loss": 3.6426869215053914, + "tokens_seen": 1018959872 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034911735205616853, + "loss": 2.6278, + "theoretical_loss": 3.642664703568715, + "tokens_seen": 1019025408 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.163722038269043, + "objective/train/theoretical_loss": 3.642653595286233, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.642653595286233, + "tokens_seen": 1019058176 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034910732196589766, + "loss": 3.0116, + "theoretical_loss": 3.6426424874609427, + "tokens_seen": 1019090944 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003490972918756269, + "loss": 2.9828, + "theoretical_loss": 3.642620273181805, + "tokens_seen": 1019156480 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034908726178535613, + "loss": 2.9913, + "theoretical_loss": 3.6425980607310344, + "tokens_seen": 1019222016 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034907723169508526, + "loss": 2.8406, + "theoretical_loss": 3.642575850108363, + "tokens_seen": 1019287552 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003490672016048145, + "loss": 3.0158, + "theoretical_loss": 3.6425536413135227, + "tokens_seen": 1019353088 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003490571715145436, + "loss": 2.8636, + "theoretical_loss": 3.642531434346245, + "tokens_seen": 1019418624 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034904714142427285, + "loss": 3.0064, + "theoretical_loss": 3.642509229206263, + "tokens_seen": 1019484160 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034903711133400203, + "loss": 2.8832, + "theoretical_loss": 3.642487025893308, + "tokens_seen": 1019549696 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003490270812437312, + "loss": 2.8028, + "theoretical_loss": 3.642464824407113, + "tokens_seen": 1019615232 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003490170511534604, + "loss": 2.867, + "theoretical_loss": 3.64244262474741, + "tokens_seen": 1019680768 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003490070210631896, + "loss": 2.7748, + "theoretical_loss": 3.6424204269139313, + "tokens_seen": 1019746304 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034899699097291876, + "loss": 2.9415, + "theoretical_loss": 3.642398230906409, + "tokens_seen": 1019811840 + }, + { + "epoch": 2.1, + "learning_rate": 0.000348986960882648, + "loss": 2.7153, + "theoretical_loss": 3.6423760367245768, + "tokens_seen": 1019877376 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003489769307923771, + "loss": 2.9185, + "theoretical_loss": 3.6423538443681656, + "tokens_seen": 1019942912 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034896690070210636, + "loss": 2.7443, + "theoretical_loss": 3.6423316538369086, + "tokens_seen": 1020008448 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003489568706118355, + "loss": 2.9, + "theoretical_loss": 3.642309465130539, + "tokens_seen": 1020073984 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003489468405215647, + "loss": 2.854, + "theoretical_loss": 3.642287278248789, + "tokens_seen": 1020139520 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003489368104312939, + "loss": 2.9607, + "theoretical_loss": 3.6422650931913916, + "tokens_seen": 1020205056 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003489267803410231, + "loss": 2.9513, + "theoretical_loss": 3.642242909958079, + "tokens_seen": 1020270592 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034891675025075226, + "loss": 2.7855, + "theoretical_loss": 3.6422207285485846, + "tokens_seen": 1020336128 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003489067201604815, + "loss": 2.9239, + "theoretical_loss": 3.6421985489626416, + "tokens_seen": 1020401664 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003488966900702106, + "loss": 2.8652, + "theoretical_loss": 3.6421763711999824, + "tokens_seen": 1020467200 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034888665997993986, + "loss": 2.9851, + "theoretical_loss": 3.64215419526034, + "tokens_seen": 1020532736 + }, + { + "epoch": 2.1, + "learning_rate": 0.000348876629889669, + "loss": 2.9712, + "theoretical_loss": 3.642132021143448, + "tokens_seen": 1020598272 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003488665997993982, + "loss": 2.8141, + "theoretical_loss": 3.6421098488490395, + "tokens_seen": 1020663808 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.967958927154541, + "objective/train/theoretical_loss": 3.6420987633851833, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.6420987633851833, + "tokens_seen": 1020696576 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003488565697091274, + "loss": 3.0624, + "theoretical_loss": 3.642087678376847, + "tokens_seen": 1020729344 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003488465396188566, + "loss": 2.9019, + "theoretical_loss": 3.6420655097266055, + "tokens_seen": 1020794880 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034883650952858577, + "loss": 2.9972, + "theoretical_loss": 3.642043342898046, + "tokens_seen": 1020860416 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034882647943831495, + "loss": 2.9783, + "theoretical_loss": 3.642021177890903, + "tokens_seen": 1020925952 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034881644934804413, + "loss": 2.8279, + "theoretical_loss": 3.64199901470491, + "tokens_seen": 1020991488 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034880641925777336, + "loss": 2.7669, + "theoretical_loss": 3.6419768533398003, + "tokens_seen": 1021057024 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003487963891675025, + "loss": 3.0001, + "theoretical_loss": 3.6419546937953076, + "tokens_seen": 1021122560 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003487863590772317, + "loss": 2.9763, + "theoretical_loss": 3.6419325360711654, + "tokens_seen": 1021188096 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034877632898696085, + "loss": 2.7375, + "theoretical_loss": 3.6419103801671073, + "tokens_seen": 1021253632 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003487662988966901, + "loss": 2.7681, + "theoretical_loss": 3.641888226082867, + "tokens_seen": 1021319168 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034875626880641927, + "loss": 2.9172, + "theoretical_loss": 3.6418660738181785, + "tokens_seen": 1021384704 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034874623871614845, + "loss": 2.8573, + "theoretical_loss": 3.6418439233727757, + "tokens_seen": 1021450240 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034873620862587763, + "loss": 2.8681, + "theoretical_loss": 3.6418217747463917, + "tokens_seen": 1021515776 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034872617853560687, + "loss": 2.9824, + "theoretical_loss": 3.6417996279387617, + "tokens_seen": 1021581312 + }, + { + "epoch": 2.1, + "learning_rate": 0.000348716148445336, + "loss": 2.9389, + "theoretical_loss": 3.6417774829496183, + "tokens_seen": 1021646848 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034870611835506523, + "loss": 2.8716, + "theoretical_loss": 3.6417553397786966, + "tokens_seen": 1021712384 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034869608826479436, + "loss": 2.6348, + "theoretical_loss": 3.64173319842573, + "tokens_seen": 1021777920 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003486860581745236, + "loss": 2.9656, + "theoretical_loss": 3.6417110588904533, + "tokens_seen": 1021843456 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034867602808425277, + "loss": 2.8549, + "theoretical_loss": 3.6416889211726, + "tokens_seen": 1021908992 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034866599799398195, + "loss": 2.8024, + "theoretical_loss": 3.641666785271905, + "tokens_seen": 1021974528 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034865596790371113, + "loss": 2.909, + "theoretical_loss": 3.641644651188102, + "tokens_seen": 1022040064 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003486459378134403, + "loss": 2.8651, + "theoretical_loss": 3.6416225189209266, + "tokens_seen": 1022105600 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003486359077231695, + "loss": 2.8565, + "theoretical_loss": 3.6416003884701116, + "tokens_seen": 1022171136 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034862587763289873, + "loss": 2.8291, + "theoretical_loss": 3.6415782598353927, + "tokens_seen": 1022236672 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034861584754262786, + "loss": 2.6667, + "theoretical_loss": 3.641556133016504, + "tokens_seen": 1022302208 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.046391487121582, + "objective/train/theoretical_loss": 3.641545070287913, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.641545070287913, + "tokens_seen": 1022334976 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003486058174523571, + "loss": 2.9825, + "theoretical_loss": 3.64153400801318, + "tokens_seen": 1022367744 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003485957873620862, + "loss": 2.8264, + "theoretical_loss": 3.6415118848251558, + "tokens_seen": 1022433280 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034858575727181546, + "loss": 2.9158, + "theoretical_loss": 3.6414897634521655, + "tokens_seen": 1022498816 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034857572718154464, + "loss": 2.7892, + "theoretical_loss": 3.641467643893944, + "tokens_seen": 1022564352 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003485656970912738, + "loss": 2.8672, + "theoretical_loss": 3.6414455261502265, + "tokens_seen": 1022629888 + }, + { + "epoch": 2.1, + "learning_rate": 0.000348555667001003, + "loss": 2.6946, + "theoretical_loss": 3.6414234102207477, + "tokens_seen": 1022695424 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034854563691073223, + "loss": 3.0411, + "theoretical_loss": 3.641401296105243, + "tokens_seen": 1022760960 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034853560682046136, + "loss": 2.8604, + "theoretical_loss": 3.641379183803447, + "tokens_seen": 1022826496 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003485255767301906, + "loss": 2.9931, + "theoretical_loss": 3.6413570733150937, + "tokens_seen": 1022892032 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003485155466399197, + "loss": 2.8553, + "theoretical_loss": 3.64133496463992, + "tokens_seen": 1022957568 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034850551654964896, + "loss": 2.7986, + "theoretical_loss": 3.64131285777766, + "tokens_seen": 1023023104 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034849548645937814, + "loss": 2.9911, + "theoretical_loss": 3.6412907527280494, + "tokens_seen": 1023088640 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003484854563691073, + "loss": 2.9963, + "theoretical_loss": 3.641268649490823, + "tokens_seen": 1023154176 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003484754262788365, + "loss": 2.869, + "theoretical_loss": 3.641246548065716, + "tokens_seen": 1023219712 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003484653961885657, + "loss": 2.8549, + "theoretical_loss": 3.641224448452465, + "tokens_seen": 1023285248 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034845536609829486, + "loss": 2.7115, + "theoretical_loss": 3.641202350650804, + "tokens_seen": 1023350784 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003484453360080241, + "loss": 2.8883, + "theoretical_loss": 3.64118025466047, + "tokens_seen": 1023416320 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003484353059177532, + "loss": 2.7518, + "theoretical_loss": 3.6411581604811967, + "tokens_seen": 1023481856 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034842527582748246, + "loss": 2.9227, + "theoretical_loss": 3.641136068112721, + "tokens_seen": 1023547392 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034841524573721164, + "loss": 2.8329, + "theoretical_loss": 3.6411139775547783, + "tokens_seen": 1023612928 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003484052156469408, + "loss": 2.9665, + "theoretical_loss": 3.6410918888071038, + "tokens_seen": 1023678464 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034839518555667, + "loss": 2.8103, + "theoretical_loss": 3.641069801869434, + "tokens_seen": 1023744000 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003483851554663992, + "loss": 2.8373, + "theoretical_loss": 3.6410477167415047, + "tokens_seen": 1023809536 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034837512537612837, + "loss": 2.8976, + "theoretical_loss": 3.6410256334230513, + "tokens_seen": 1023875072 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003483650952858576, + "loss": 2.9443, + "theoretical_loss": 3.64100355191381, + "tokens_seen": 1023940608 + }, + { + "epoch": 2.1, + "objective/train/docs_used": 1613583, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7551822662353516, + "objective/train/theoretical_loss": 3.6409925118375615, + "objective/train/tokens_used": 1025103328, + "theoretical_loss": 3.6409925118375615, + "tokens_seen": 1023973376 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034835506519558673, + "loss": 2.7437, + "theoretical_loss": 3.6409814722135163, + "tokens_seen": 1024006144 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034834503510531597, + "loss": 2.8725, + "theoretical_loss": 3.640959394321907, + "tokens_seen": 1024071680 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034833500501504515, + "loss": 2.8429, + "theoretical_loss": 3.6409373182387186, + "tokens_seen": 1024137216 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034832497492477433, + "loss": 2.8823, + "theoretical_loss": 3.6409152439636863, + "tokens_seen": 1024202752 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034831494483450356, + "loss": 2.8311, + "theoretical_loss": 3.640893171496546, + "tokens_seen": 1024268288 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003483049147442327, + "loss": 2.9814, + "theoretical_loss": 3.640871100837035, + "tokens_seen": 1024333824 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003482948846539619, + "loss": 2.8962, + "theoretical_loss": 3.6408490319848887, + "tokens_seen": 1024399360 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034828485456369105, + "loss": 2.8844, + "theoretical_loss": 3.640826964939845, + "tokens_seen": 1024464896 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003482748244734203, + "loss": 2.7852, + "theoretical_loss": 3.6408048997016387, + "tokens_seen": 1024530432 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034826479438314947, + "loss": 2.7678, + "theoretical_loss": 3.6407828362700068, + "tokens_seen": 1024595968 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034825476429287865, + "loss": 2.9312, + "theoretical_loss": 3.640760774644686, + "tokens_seen": 1024661504 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034824473420260783, + "loss": 2.748, + "theoretical_loss": 3.640738714825413, + "tokens_seen": 1024727040 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034823470411233707, + "loss": 2.9237, + "theoretical_loss": 3.6407166568119242, + "tokens_seen": 1024792576 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003482246740220662, + "loss": 2.875, + "theoretical_loss": 3.6406946006039567, + "tokens_seen": 1024858112 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034821464393179543, + "loss": 2.7599, + "theoretical_loss": 3.640672546201247, + "tokens_seen": 1024923648 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034820461384152456, + "loss": 2.9538, + "theoretical_loss": 3.6406504936035313, + "tokens_seen": 1024989184 + }, + { + "epoch": 2.1, + "learning_rate": 0.0003481945837512538, + "loss": 2.8967, + "theoretical_loss": 3.6406284428105478, + "tokens_seen": 1025054720 + }, + { + "epoch": 2.1, + "learning_rate": 0.00034818455366098297, + "loss": 2.8687, + "theoretical_loss": 3.6406063938220323, + "tokens_seen": 1025120256 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034817452357071215, + "loss": 3.5754, + "theoretical_loss": 3.6405833132202217, + "tokens_seen": 1025188864 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034816449348044133, + "loss": 2.9407, + "theoretical_loss": 3.640561267924408, + "tokens_seen": 1025254400 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003481544633901705, + "loss": 2.7111, + "theoretical_loss": 3.640539224432261, + "tokens_seen": 1025319936 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003481444332998997, + "loss": 2.8528, + "theoretical_loss": 3.640517182743519, + "tokens_seen": 1025385472 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034813440320962893, + "loss": 2.87, + "theoretical_loss": 3.640495142857918, + "tokens_seen": 1025451008 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034812437311935806, + "loss": 2.8654, + "theoretical_loss": 3.640473104775196, + "tokens_seen": 1025516544 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003481143430290873, + "loss": 3.014, + "theoretical_loss": 3.640451068495091, + "tokens_seen": 1025582080 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1646905, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0615015029907227, + "objective/train/theoretical_loss": 3.6404455597066923, + "objective/train/tokens_used": 1046058464, + "theoretical_loss": 3.6404455597066923, + "tokens_seen": 1025598464 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003481043129388164, + "loss": 3.0693, + "theoretical_loss": 3.640429034017339, + "tokens_seen": 1025647616 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034809428284854566, + "loss": 2.9091, + "theoretical_loss": 3.6404070013416776, + "tokens_seen": 1025713152 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034808425275827484, + "loss": 2.98, + "theoretical_loss": 3.6403849704678457, + "tokens_seen": 1025778688 + }, + { + "epoch": 3.0, + "learning_rate": 0.000348074222668004, + "loss": 3.0772, + "theoretical_loss": 3.640362941395579, + "tokens_seen": 1025844224 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003480641925777332, + "loss": 3.048, + "theoretical_loss": 3.6403409141246166, + "tokens_seen": 1025909760 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034805416248746243, + "loss": 3.0334, + "theoretical_loss": 3.6403188886546953, + "tokens_seen": 1025975296 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034804413239719156, + "loss": 2.8802, + "theoretical_loss": 3.6402968649855527, + "tokens_seen": 1026040832 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003480341023069208, + "loss": 2.9044, + "theoretical_loss": 3.640274843116927, + "tokens_seen": 1026106368 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003480240722166499, + "loss": 2.9394, + "theoretical_loss": 3.6402528230485567, + "tokens_seen": 1026171904 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034801404212637916, + "loss": 2.7026, + "theoretical_loss": 3.6402308047801784, + "tokens_seen": 1026237440 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034800401203610834, + "loss": 2.9466, + "theoretical_loss": 3.64020878831153, + "tokens_seen": 1026302976 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003479939819458375, + "loss": 2.8418, + "theoretical_loss": 3.640186773642351, + "tokens_seen": 1026368512 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003479839518555667, + "loss": 2.9157, + "theoretical_loss": 3.640164760772378, + "tokens_seen": 1026434048 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003479739217652959, + "loss": 2.8191, + "theoretical_loss": 3.640142749701349, + "tokens_seen": 1026499584 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034796389167502506, + "loss": 2.928, + "theoretical_loss": 3.640120740429003, + "tokens_seen": 1026565120 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003479538615847543, + "loss": 2.8075, + "theoretical_loss": 3.640098732955078, + "tokens_seen": 1026630656 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034794383149448343, + "loss": 2.9645, + "theoretical_loss": 3.6400767272793124, + "tokens_seen": 1026696192 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034793380140421266, + "loss": 2.9817, + "theoretical_loss": 3.6400547234014438, + "tokens_seen": 1026761728 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034792377131394184, + "loss": 2.991, + "theoretical_loss": 3.6400327213212114, + "tokens_seen": 1026827264 + }, + { + "epoch": 3.0, + "learning_rate": 0.000347913741223671, + "loss": 3.0088, + "theoretical_loss": 3.6400107210383528, + "tokens_seen": 1026892800 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003479037111334002, + "loss": 2.8336, + "theoretical_loss": 3.6399887225526073, + "tokens_seen": 1026958336 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003478936810431294, + "loss": 2.8219, + "theoretical_loss": 3.639966725863713, + "tokens_seen": 1027023872 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034788365095285857, + "loss": 3.1043, + "theoretical_loss": 3.6399447309714086, + "tokens_seen": 1027089408 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003478736208625878, + "loss": 2.8958, + "theoretical_loss": 3.639922737875432, + "tokens_seen": 1027154944 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034786359077231693, + "loss": 2.9184, + "theoretical_loss": 3.639900746575523, + "tokens_seen": 1027220480 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1649823, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8548176288604736, + "objective/train/theoretical_loss": 3.6398952490311514, + "objective/train/tokens_used": 1047696864, + "theoretical_loss": 3.6398952490311514, + "tokens_seen": 1027236864 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034785356068204617, + "loss": 2.873, + "theoretical_loss": 3.63987875707142, + "tokens_seen": 1027286016 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003478435305917753, + "loss": 2.9449, + "theoretical_loss": 3.6398567693628623, + "tokens_seen": 1027351552 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034783350050150453, + "loss": 2.8173, + "theoretical_loss": 3.6398347834495874, + "tokens_seen": 1027417088 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003478234704112337, + "loss": 2.7484, + "theoretical_loss": 3.6398127993313354, + "tokens_seen": 1027482624 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003478134403209629, + "loss": 2.9559, + "theoretical_loss": 3.639790817007845, + "tokens_seen": 1027548160 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034780341023069207, + "loss": 2.9202, + "theoretical_loss": 3.6397688364788543, + "tokens_seen": 1027613696 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034779338014042125, + "loss": 2.8807, + "theoretical_loss": 3.639746857744104, + "tokens_seen": 1027679232 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034778335005015043, + "loss": 2.8968, + "theoretical_loss": 3.639724880803332, + "tokens_seen": 1027744768 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034777331995987967, + "loss": 2.9942, + "theoretical_loss": 3.6397029056562777, + "tokens_seen": 1027810304 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003477632898696088, + "loss": 2.9654, + "theoretical_loss": 3.6396809323026815, + "tokens_seen": 1027875840 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034775325977933803, + "loss": 2.7965, + "theoretical_loss": 3.639658960742281, + "tokens_seen": 1027941376 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003477432296890672, + "loss": 2.9793, + "theoretical_loss": 3.6396369909748163, + "tokens_seen": 1028006912 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003477331995987964, + "loss": 3.0048, + "theoretical_loss": 3.639615023000027, + "tokens_seen": 1028072448 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003477231695085256, + "loss": 3.0501, + "theoretical_loss": 3.6395930568176516, + "tokens_seen": 1028137984 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034771313941825476, + "loss": 2.7782, + "theoretical_loss": 3.639571092427431, + "tokens_seen": 1028203520 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034770310932798394, + "loss": 2.7002, + "theoretical_loss": 3.639549129829104, + "tokens_seen": 1028269056 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034769307923771317, + "loss": 3.0267, + "theoretical_loss": 3.6395271690224105, + "tokens_seen": 1028334592 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003476830491474423, + "loss": 2.9111, + "theoretical_loss": 3.6395052100070897, + "tokens_seen": 1028400128 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034767301905717153, + "loss": 2.8161, + "theoretical_loss": 3.639483252782882, + "tokens_seen": 1028465664 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034766298896690066, + "loss": 2.9662, + "theoretical_loss": 3.639461297349527, + "tokens_seen": 1028531200 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003476529588766299, + "loss": 3.0294, + "theoretical_loss": 3.639439343706764, + "tokens_seen": 1028596736 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003476429287863591, + "loss": 2.9002, + "theoretical_loss": 3.6394173918543333, + "tokens_seen": 1028662272 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034763289869608826, + "loss": 2.964, + "theoretical_loss": 3.639395441791975, + "tokens_seen": 1028727808 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034762286860581744, + "loss": 2.9344, + "theoretical_loss": 3.639373493519429, + "tokens_seen": 1028793344 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003476128385155466, + "loss": 3.04, + "theoretical_loss": 3.6393515470364353, + "tokens_seen": 1028858880 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1652746, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.185509443283081, + "objective/train/theoretical_loss": 3.6393460606952734, + "objective/train/tokens_used": 1049335264, + "theoretical_loss": 3.6393460606952734, + "tokens_seen": 1028875264 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003476028084252758, + "loss": 2.9613, + "theoretical_loss": 3.639329602342734, + "tokens_seen": 1028924416 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034759277833500504, + "loss": 3.004, + "theoretical_loss": 3.6393076594380647, + "tokens_seen": 1028989952 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003475827482447342, + "loss": 2.8539, + "theoretical_loss": 3.639285718322169, + "tokens_seen": 1029055488 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003475727181544634, + "loss": 2.9045, + "theoretical_loss": 3.6392637789947857, + "tokens_seen": 1029121024 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034756268806419263, + "loss": 2.8339, + "theoretical_loss": 3.6392418414556564, + "tokens_seen": 1029186560 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034755265797392176, + "loss": 2.8828, + "theoretical_loss": 3.6392199057045205, + "tokens_seen": 1029252096 + }, + { + "epoch": 3.0, + "learning_rate": 0.000347542627883651, + "loss": 2.793, + "theoretical_loss": 3.639197971741119, + "tokens_seen": 1029317632 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003475325977933801, + "loss": 2.9745, + "theoretical_loss": 3.639176039565192, + "tokens_seen": 1029383168 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034752256770310936, + "loss": 2.9028, + "theoretical_loss": 3.6391541091764803, + "tokens_seen": 1029448704 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034751253761283854, + "loss": 2.9119, + "theoretical_loss": 3.6391321805747245, + "tokens_seen": 1029514240 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003475025075225677, + "loss": 2.8672, + "theoretical_loss": 3.6391102537596653, + "tokens_seen": 1029579776 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003474924774322969, + "loss": 2.8761, + "theoretical_loss": 3.6390883287310434, + "tokens_seen": 1029645312 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003474824473420261, + "loss": 2.89, + "theoretical_loss": 3.639066405488599, + "tokens_seen": 1029710848 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034747241725175527, + "loss": 2.9516, + "theoretical_loss": 3.639044484032074, + "tokens_seen": 1029776384 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003474623871614845, + "loss": 2.8109, + "theoretical_loss": 3.639022564361208, + "tokens_seen": 1029841920 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034745235707121363, + "loss": 2.9481, + "theoretical_loss": 3.6390006464757434, + "tokens_seen": 1029907456 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034744232698094286, + "loss": 2.9956, + "theoretical_loss": 3.63897873037542, + "tokens_seen": 1029972992 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034743229689067204, + "loss": 2.7549, + "theoretical_loss": 3.638956816059979, + "tokens_seen": 1030038528 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003474222668004012, + "loss": 2.9334, + "theoretical_loss": 3.638934903529162, + "tokens_seen": 1030104064 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003474122367101304, + "loss": 2.75, + "theoretical_loss": 3.6389129927827097, + "tokens_seen": 1030169600 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003474022066198596, + "loss": 3.064, + "theoretical_loss": 3.6388910838203636, + "tokens_seen": 1030235136 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034739217652958877, + "loss": 2.8275, + "theoretical_loss": 3.638869176641865, + "tokens_seen": 1030300672 + }, + { + "epoch": 3.0, + "learning_rate": 0.000347382146439318, + "loss": 2.8109, + "theoretical_loss": 3.6388472712469544, + "tokens_seen": 1030366208 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034737211634904713, + "loss": 2.8259, + "theoretical_loss": 3.638825367635374, + "tokens_seen": 1030431744 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034736208625877637, + "loss": 2.8766, + "theoretical_loss": 3.638803465806865, + "tokens_seen": 1030497280 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1654929, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.742549180984497, + "objective/train/theoretical_loss": 3.638797990628312, + "objective/train/tokens_used": 1050973664, + "theoretical_loss": 3.638797990628312, + "tokens_seen": 1030513664 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003473520561685055, + "loss": 2.7634, + "theoretical_loss": 3.6387815657611684, + "tokens_seen": 1030562816 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034734202607823473, + "loss": 2.9285, + "theoretical_loss": 3.638759667498027, + "tokens_seen": 1030628352 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003473319959879639, + "loss": 2.938, + "theoretical_loss": 3.6387377710171807, + "tokens_seen": 1030693888 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003473219658976931, + "loss": 2.8065, + "theoretical_loss": 3.6387158763183725, + "tokens_seen": 1030759424 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034731193580742227, + "loss": 2.9391, + "theoretical_loss": 3.6386939834013434, + "tokens_seen": 1030824960 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034730190571715145, + "loss": 2.8401, + "theoretical_loss": 3.6386720922658347, + "tokens_seen": 1030890496 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034729187562688063, + "loss": 3.0311, + "theoretical_loss": 3.6386502029115895, + "tokens_seen": 1030956032 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034728184553660987, + "loss": 3.0004, + "theoretical_loss": 3.6386283153383485, + "tokens_seen": 1031021568 + }, + { + "epoch": 3.0, + "learning_rate": 0.000347271815446339, + "loss": 2.9875, + "theoretical_loss": 3.6386064295458542, + "tokens_seen": 1031087104 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034726178535606823, + "loss": 2.9621, + "theoretical_loss": 3.638584545533848, + "tokens_seen": 1031152640 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003472517552657974, + "loss": 2.9238, + "theoretical_loss": 3.6385626633020722, + "tokens_seen": 1031218176 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003472417251755266, + "loss": 3.0336, + "theoretical_loss": 3.63854078285027, + "tokens_seen": 1031283712 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003472316950852558, + "loss": 3.0282, + "theoretical_loss": 3.638518904178181, + "tokens_seen": 1031349248 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034722166499498496, + "loss": 2.8323, + "theoretical_loss": 3.638497027285549, + "tokens_seen": 1031414784 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034721163490471414, + "loss": 2.9097, + "theoretical_loss": 3.6384751521721164, + "tokens_seen": 1031480320 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034720160481444337, + "loss": 2.8935, + "theoretical_loss": 3.638453278837625, + "tokens_seen": 1031545856 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003471915747241725, + "loss": 2.9341, + "theoretical_loss": 3.6384314072818174, + "tokens_seen": 1031611392 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034718154463390173, + "loss": 2.7556, + "theoretical_loss": 3.6384095375044354, + "tokens_seen": 1031676928 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034717151454363086, + "loss": 2.8752, + "theoretical_loss": 3.6383876695052217, + "tokens_seen": 1031742464 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003471614844533601, + "loss": 2.9185, + "theoretical_loss": 3.6383658032839192, + "tokens_seen": 1031808000 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003471514543630893, + "loss": 2.9834, + "theoretical_loss": 3.6383439388402703, + "tokens_seen": 1031873536 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034714142427281846, + "loss": 2.8072, + "theoretical_loss": 3.638322076174017, + "tokens_seen": 1031939072 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034713139418254764, + "loss": 2.9992, + "theoretical_loss": 3.6383002152849024, + "tokens_seen": 1032004608 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003471213640922768, + "loss": 2.9192, + "theoretical_loss": 3.6382783561726697, + "tokens_seen": 1032070144 + }, + { + "epoch": 3.0, + "learning_rate": 0.000347111334002006, + "loss": 2.8388, + "theoretical_loss": 3.6382564988370607, + "tokens_seen": 1032135680 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1657823, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9087984561920166, + "objective/train/theoretical_loss": 3.6382510347807258, + "objective/train/tokens_used": 1052612064, + "theoretical_loss": 3.6382510347807258, + "tokens_seen": 1032152064 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034710130391173524, + "loss": 2.864, + "theoretical_loss": 3.6382346432778183, + "tokens_seen": 1032201216 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034709127382146436, + "loss": 2.7893, + "theoretical_loss": 3.638212789494686, + "tokens_seen": 1032266752 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003470812437311936, + "loss": 2.8027, + "theoretical_loss": 3.6381909374874066, + "tokens_seen": 1032332288 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003470712136409228, + "loss": 3.0012, + "theoretical_loss": 3.638169087255723, + "tokens_seen": 1032397824 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034706118355065196, + "loss": 3.0612, + "theoretical_loss": 3.6381472387993776, + "tokens_seen": 1032463360 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034705115346038114, + "loss": 2.8909, + "theoretical_loss": 3.6381253921181145, + "tokens_seen": 1032528896 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003470411233701103, + "loss": 2.9222, + "theoretical_loss": 3.638103547211676, + "tokens_seen": 1032594432 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003470310932798395, + "loss": 2.9052, + "theoretical_loss": 3.6380817040798057, + "tokens_seen": 1032659968 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034702106318956874, + "loss": 3.0259, + "theoretical_loss": 3.6380598627222467, + "tokens_seen": 1032725504 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034701103309929787, + "loss": 2.9093, + "theoretical_loss": 3.6380380231387424, + "tokens_seen": 1032791040 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003470010030090271, + "loss": 2.9559, + "theoretical_loss": 3.638016185329036, + "tokens_seen": 1032856576 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034699097291875623, + "loss": 2.9318, + "theoretical_loss": 3.637994349292871, + "tokens_seen": 1032922112 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034698094282848547, + "loss": 2.7813, + "theoretical_loss": 3.6379725150299906, + "tokens_seen": 1032987648 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034697091273821465, + "loss": 2.9821, + "theoretical_loss": 3.637950682540139, + "tokens_seen": 1033053184 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034696088264794383, + "loss": 2.7882, + "theoretical_loss": 3.637928851823059, + "tokens_seen": 1033118720 + }, + { + "epoch": 3.0, + "learning_rate": 0.000346950852557673, + "loss": 2.8885, + "theoretical_loss": 3.6379070228784944, + "tokens_seen": 1033184256 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034694082246740224, + "loss": 3.0143, + "theoretical_loss": 3.6378851957061893, + "tokens_seen": 1033249792 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034693079237713137, + "loss": 2.8831, + "theoretical_loss": 3.6378633703058867, + "tokens_seen": 1033315328 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003469207622868606, + "loss": 2.8981, + "theoretical_loss": 3.637841546677331, + "tokens_seen": 1033380864 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034691073219658973, + "loss": 2.9588, + "theoretical_loss": 3.6378197248202655, + "tokens_seen": 1033446400 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034690070210631897, + "loss": 2.9262, + "theoretical_loss": 3.637797904734435, + "tokens_seen": 1033511936 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034689067201604815, + "loss": 2.923, + "theoretical_loss": 3.6377760864195814, + "tokens_seen": 1033577472 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034688064192577733, + "loss": 2.9195, + "theoretical_loss": 3.6377542698754515, + "tokens_seen": 1033643008 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003468706118355065, + "loss": 2.9283, + "theoretical_loss": 3.637732455101787, + "tokens_seen": 1033708544 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003468605817452357, + "loss": 2.9479, + "theoretical_loss": 3.6377106420983334, + "tokens_seen": 1033774080 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1660748, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.3461458683013916, + "objective/train/theoretical_loss": 3.637705189124035, + "objective/train/tokens_used": 1054250464, + "theoretical_loss": 3.637705189124035, + "tokens_seen": 1033790464 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003468505516549649, + "loss": 2.9617, + "theoretical_loss": 3.637688830864834, + "tokens_seen": 1033839616 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003468405215646941, + "loss": 2.8218, + "theoretical_loss": 3.6376670214010334, + "tokens_seen": 1033905152 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003468304914744233, + "loss": 2.7174, + "theoretical_loss": 3.637645213706676, + "tokens_seen": 1033970688 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034682046138415247, + "loss": 2.8647, + "theoretical_loss": 3.637623407781506, + "tokens_seen": 1034036224 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034681043129388165, + "loss": 2.8244, + "theoretical_loss": 3.637601603625267, + "tokens_seen": 1034101760 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034680040120361083, + "loss": 2.8523, + "theoretical_loss": 3.6375798012377043, + "tokens_seen": 1034167296 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034679037111334007, + "loss": 3.0134, + "theoretical_loss": 3.6375580006185624, + "tokens_seen": 1034232832 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003467803410230692, + "loss": 2.848, + "theoretical_loss": 3.6375362017675856, + "tokens_seen": 1034298368 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034677031093279843, + "loss": 2.9657, + "theoretical_loss": 3.6375144046845183, + "tokens_seen": 1034363904 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003467602808425276, + "loss": 3.0835, + "theoretical_loss": 3.6374926093691045, + "tokens_seen": 1034429440 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003467502507522568, + "loss": 3.039, + "theoretical_loss": 3.63747081582109, + "tokens_seen": 1034494976 + }, + { + "epoch": 3.0, + "learning_rate": 0.000346740220661986, + "loss": 2.7411, + "theoretical_loss": 3.6374490240402197, + "tokens_seen": 1034560512 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034673019057171516, + "loss": 2.8664, + "theoretical_loss": 3.6374272340262372, + "tokens_seen": 1034626048 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034672016048144434, + "loss": 2.8988, + "theoretical_loss": 3.6374054457788882, + "tokens_seen": 1034691584 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034671013039117357, + "loss": 2.9694, + "theoretical_loss": 3.6373836592979174, + "tokens_seen": 1034757120 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003467001003009027, + "loss": 2.9078, + "theoretical_loss": 3.6373618745830694, + "tokens_seen": 1034822656 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034669007021063193, + "loss": 2.6788, + "theoretical_loss": 3.6373400916340892, + "tokens_seen": 1034888192 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034668004012036106, + "loss": 2.9222, + "theoretical_loss": 3.6373183104507225, + "tokens_seen": 1034953728 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003466700100300903, + "loss": 2.8393, + "theoretical_loss": 3.637296531032714, + "tokens_seen": 1035019264 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003466599799398195, + "loss": 2.8588, + "theoretical_loss": 3.637274753379809, + "tokens_seen": 1035084800 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034664994984954866, + "loss": 3.0493, + "theoretical_loss": 3.637252977491752, + "tokens_seen": 1035150336 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034663991975927784, + "loss": 2.8487, + "theoretical_loss": 3.637231203368289, + "tokens_seen": 1035215872 + }, + { + "epoch": 3.0, + "learning_rate": 0.000346629889669007, + "loss": 2.9659, + "theoretical_loss": 3.6372094310091656, + "tokens_seen": 1035281408 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003466198595787362, + "loss": 2.8545, + "theoretical_loss": 3.637187660414126, + "tokens_seen": 1035346944 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034660982948846544, + "loss": 2.8893, + "theoretical_loss": 3.637165891582917, + "tokens_seen": 1035412480 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1663752, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.770439863204956, + "objective/train/theoretical_loss": 3.6371604496506826, + "objective/train/tokens_used": 1055888864, + "theoretical_loss": 3.6371604496506826, + "tokens_seen": 1035428864 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034659979939819456, + "loss": 2.8082, + "theoretical_loss": 3.6371441245152827, + "tokens_seen": 1035478016 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003465897693079238, + "loss": 2.7911, + "theoretical_loss": 3.6371223592109696, + "tokens_seen": 1035543552 + }, + { + "epoch": 3.0, + "learning_rate": 0.000346579739217653, + "loss": 2.8547, + "theoretical_loss": 3.637100595669723, + "tokens_seen": 1035609088 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034656970912738216, + "loss": 2.8255, + "theoretical_loss": 3.6370788338912883, + "tokens_seen": 1035674624 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034655967903711134, + "loss": 2.9, + "theoretical_loss": 3.6370570738754116, + "tokens_seen": 1035740160 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003465496489468405, + "loss": 2.7747, + "theoretical_loss": 3.6370353156218385, + "tokens_seen": 1035805696 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003465396188565697, + "loss": 3.0165, + "theoretical_loss": 3.6370135591303145, + "tokens_seen": 1035871232 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034652958876629894, + "loss": 2.8448, + "theoretical_loss": 3.636991804400586, + "tokens_seen": 1035936768 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034651955867602807, + "loss": 2.8632, + "theoretical_loss": 3.6369700514323977, + "tokens_seen": 1036002304 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003465095285857573, + "loss": 3.0765, + "theoretical_loss": 3.636948300225497, + "tokens_seen": 1036067840 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034649949849548643, + "loss": 2.9344, + "theoretical_loss": 3.636926550779629, + "tokens_seen": 1036133376 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034648946840521567, + "loss": 2.9422, + "theoretical_loss": 3.6369048030945406, + "tokens_seen": 1036198912 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034647943831494485, + "loss": 2.9271, + "theoretical_loss": 3.6368830571699764, + "tokens_seen": 1036264448 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034646940822467403, + "loss": 2.8969, + "theoretical_loss": 3.636861313005684, + "tokens_seen": 1036329984 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003464593781344032, + "loss": 2.8427, + "theoretical_loss": 3.636839570601409, + "tokens_seen": 1036395520 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034644934804413244, + "loss": 3.0296, + "theoretical_loss": 3.6368178299568976, + "tokens_seen": 1036461056 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034643931795386157, + "loss": 2.9554, + "theoretical_loss": 3.6367960910718966, + "tokens_seen": 1036526592 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003464292878635908, + "loss": 2.862, + "theoretical_loss": 3.6367743539461515, + "tokens_seen": 1036592128 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034641925777331993, + "loss": 2.9518, + "theoretical_loss": 3.63675261857941, + "tokens_seen": 1036657664 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034640922768304917, + "loss": 2.9892, + "theoretical_loss": 3.636730884971417, + "tokens_seen": 1036723200 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034639919759277835, + "loss": 2.7551, + "theoretical_loss": 3.6367091531219202, + "tokens_seen": 1036788736 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034638916750250753, + "loss": 2.8759, + "theoretical_loss": 3.636687423030666, + "tokens_seen": 1036854272 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003463791374122367, + "loss": 3.0031, + "theoretical_loss": 3.6366656946974008, + "tokens_seen": 1036919808 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003463691073219659, + "loss": 2.8214, + "theoretical_loss": 3.636643968121871, + "tokens_seen": 1036985344 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003463590772316951, + "loss": 3.1173, + "theoretical_loss": 3.6366222433038238, + "tokens_seen": 1037050880 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1666738, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.523831605911255, + "objective/train/theoretical_loss": 3.6366168123738887, + "objective/train/tokens_used": 1057527264, + "theoretical_loss": 3.6366168123738887, + "tokens_seen": 1037067264 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003463490471414243, + "loss": 2.6589, + "theoretical_loss": 3.6366005202430056, + "tokens_seen": 1037116416 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034633901705115344, + "loss": 2.9006, + "theoretical_loss": 3.636578798939164, + "tokens_seen": 1037181952 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034632898696088267, + "loss": 2.8813, + "theoretical_loss": 3.6365570793920448, + "tokens_seen": 1037247488 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003463189568706118, + "loss": 2.9762, + "theoretical_loss": 3.636535361601396, + "tokens_seen": 1037313024 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034630892678034103, + "loss": 3.0119, + "theoretical_loss": 3.6365136455669638, + "tokens_seen": 1037378560 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003462988966900702, + "loss": 2.9595, + "theoretical_loss": 3.6364919312884956, + "tokens_seen": 1037444096 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003462888665997994, + "loss": 2.8737, + "theoretical_loss": 3.636470218765738, + "tokens_seen": 1037509632 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003462788365095286, + "loss": 2.9562, + "theoretical_loss": 3.6364485079984394, + "tokens_seen": 1037575168 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003462688064192578, + "loss": 2.9184, + "theoretical_loss": 3.6364267989863457, + "tokens_seen": 1037640704 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034625877632898694, + "loss": 2.88, + "theoretical_loss": 3.6364050917292046, + "tokens_seen": 1037706240 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003462487462387162, + "loss": 2.89, + "theoretical_loss": 3.636383386226764, + "tokens_seen": 1037771776 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003462387161484453, + "loss": 2.7583, + "theoretical_loss": 3.6363616824787703, + "tokens_seen": 1037837312 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034622868605817454, + "loss": 2.773, + "theoretical_loss": 3.636339980484972, + "tokens_seen": 1037902848 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003462186559679037, + "loss": 3.0198, + "theoretical_loss": 3.6363182802451153, + "tokens_seen": 1037968384 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003462086258776329, + "loss": 2.7816, + "theoretical_loss": 3.6362965817589483, + "tokens_seen": 1038033920 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003461985957873621, + "loss": 2.9151, + "theoretical_loss": 3.6362748850262188, + "tokens_seen": 1038099456 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034618856569709126, + "loss": 2.9992, + "theoretical_loss": 3.6362531900466744, + "tokens_seen": 1038164992 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034617853560682044, + "loss": 2.8241, + "theoretical_loss": 3.636231496820062, + "tokens_seen": 1038230528 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003461685055165497, + "loss": 2.8755, + "theoretical_loss": 3.6362098053461303, + "tokens_seen": 1038296064 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003461584754262788, + "loss": 2.8178, + "theoretical_loss": 3.636188115624627, + "tokens_seen": 1038361600 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034614844533600804, + "loss": 2.854, + "theoretical_loss": 3.636166427655299, + "tokens_seen": 1038427136 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034613841524573717, + "loss": 2.8668, + "theoretical_loss": 3.6361447414378953, + "tokens_seen": 1038492672 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003461283851554664, + "loss": 2.7562, + "theoretical_loss": 3.6361230569721634, + "tokens_seen": 1038558208 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003461183550651956, + "loss": 2.684, + "theoretical_loss": 3.6361013742578514, + "tokens_seen": 1038623744 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034610832497492476, + "loss": 2.8295, + "theoretical_loss": 3.6360796932947066, + "tokens_seen": 1038689280 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1668593, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9363644123077393, + "objective/train/theoretical_loss": 3.636074273327511, + "objective/train/tokens_used": 1059165664, + "theoretical_loss": 3.636074273327511, + "tokens_seen": 1038705664 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034609829488465395, + "loss": 3.0061, + "theoretical_loss": 3.636058014082478, + "tokens_seen": 1038754816 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003460882647943832, + "loss": 3.0417, + "theoretical_loss": 3.6360363366209136, + "tokens_seen": 1038820352 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034607823470411236, + "loss": 2.8592, + "theoretical_loss": 3.6360146609097614, + "tokens_seen": 1038885888 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034606820461384154, + "loss": 2.9711, + "theoretical_loss": 3.6359929869487697, + "tokens_seen": 1038951424 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003460581745235707, + "loss": 3.0233, + "theoretical_loss": 3.635971314737686, + "tokens_seen": 1039016960 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003460481444332999, + "loss": 2.6607, + "theoretical_loss": 3.635949644276261, + "tokens_seen": 1039082496 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034603811434302914, + "loss": 2.95, + "theoretical_loss": 3.6359279755642406, + "tokens_seen": 1039148032 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034602808425275827, + "loss": 2.8553, + "theoretical_loss": 3.635906308601374, + "tokens_seen": 1039213568 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003460180541624875, + "loss": 2.9871, + "theoretical_loss": 3.6358846433874104, + "tokens_seen": 1039279104 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034600802407221663, + "loss": 2.8623, + "theoretical_loss": 3.6358629799220976, + "tokens_seen": 1039344640 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034599799398194587, + "loss": 2.8679, + "theoretical_loss": 3.635841318205185, + "tokens_seen": 1039410176 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034598796389167505, + "loss": 2.9139, + "theoretical_loss": 3.63581965823642, + "tokens_seen": 1039475712 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034597793380140423, + "loss": 2.9544, + "theoretical_loss": 3.6357980000155523, + "tokens_seen": 1039541248 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003459679037111334, + "loss": 2.7788, + "theoretical_loss": 3.635776343542331, + "tokens_seen": 1039606784 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034595787362086264, + "loss": 2.8122, + "theoretical_loss": 3.6357546888165038, + "tokens_seen": 1039672320 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034594784353059177, + "loss": 2.876, + "theoretical_loss": 3.635733035837821, + "tokens_seen": 1039737856 + }, + { + "epoch": 3.0, + "learning_rate": 0.000345937813440321, + "loss": 2.9465, + "theoretical_loss": 3.6357113846060294, + "tokens_seen": 1039803392 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034592778335005013, + "loss": 2.9337, + "theoretical_loss": 3.63568973512088, + "tokens_seen": 1039868928 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034591775325977937, + "loss": 2.9354, + "theoretical_loss": 3.635668087382121, + "tokens_seen": 1039934464 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034590772316950855, + "loss": 3.0546, + "theoretical_loss": 3.6356464413895013, + "tokens_seen": 1040000000 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034589769307923773, + "loss": 2.9092, + "theoretical_loss": 3.6356247971427704, + "tokens_seen": 1040065536 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003458876629889669, + "loss": 2.9826, + "theoretical_loss": 3.635603154641678, + "tokens_seen": 1040131072 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003458776328986961, + "loss": 2.7682, + "theoretical_loss": 3.6355815138859717, + "tokens_seen": 1040196608 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003458676028084253, + "loss": 2.8827, + "theoretical_loss": 3.635559874875402, + "tokens_seen": 1040262144 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003458575727181545, + "loss": 2.7124, + "theoretical_loss": 3.635538237609719, + "tokens_seen": 1040327680 + }, + { + "epoch": 3.0, + "objective/train/docs_used": 1671371, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.772104024887085, + "objective/train/theoretical_loss": 3.635532828565907, + "objective/train/tokens_used": 1060804064, + "theoretical_loss": 3.635532828565907, + "tokens_seen": 1040344064 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034584754262788364, + "loss": 2.8799, + "theoretical_loss": 3.63551660208867, + "tokens_seen": 1040393216 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034583751253761287, + "loss": 2.9766, + "theoretical_loss": 3.635494968312006, + "tokens_seen": 1040458752 + }, + { + "epoch": 3.0, + "learning_rate": 0.000345827482447342, + "loss": 3.0137, + "theoretical_loss": 3.6354733362794764, + "tokens_seen": 1040524288 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034581745235707123, + "loss": 2.8067, + "theoretical_loss": 3.63545170599083, + "tokens_seen": 1040589824 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003458074222668004, + "loss": 2.756, + "theoretical_loss": 3.6354300774458173, + "tokens_seen": 1040655360 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003457973921765296, + "loss": 2.9266, + "theoretical_loss": 3.635408450644187, + "tokens_seen": 1040720896 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003457873620862588, + "loss": 2.9658, + "theoretical_loss": 3.635386825585689, + "tokens_seen": 1040786432 + }, + { + "epoch": 3.0, + "learning_rate": 0.000345777331995988, + "loss": 2.918, + "theoretical_loss": 3.6353652022700738, + "tokens_seen": 1040851968 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034576730190571714, + "loss": 2.9504, + "theoretical_loss": 3.635343580697091, + "tokens_seen": 1040917504 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003457572718154464, + "loss": 2.8973, + "theoretical_loss": 3.6353219608664897, + "tokens_seen": 1040983040 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003457472417251755, + "loss": 3.0661, + "theoretical_loss": 3.635300342778021, + "tokens_seen": 1041048576 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034573721163490474, + "loss": 2.9043, + "theoretical_loss": 3.6352787264314332, + "tokens_seen": 1041114112 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003457271815446339, + "loss": 3.1037, + "theoretical_loss": 3.635257111826478, + "tokens_seen": 1041179648 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003457171514543631, + "loss": 2.9321, + "theoretical_loss": 3.6352354989629045, + "tokens_seen": 1041245184 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003457071213640923, + "loss": 2.8573, + "theoretical_loss": 3.635213887840463, + "tokens_seen": 1041310720 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034569709127382146, + "loss": 2.9066, + "theoretical_loss": 3.635192278458904, + "tokens_seen": 1041376256 + }, + { + "epoch": 3.0, + "learning_rate": 0.00034568706118355064, + "loss": 3.0229, + "theoretical_loss": 3.6351706708179767, + "tokens_seen": 1041441792 + }, + { + "epoch": 3.0, + "learning_rate": 0.0003456770310932799, + "loss": 2.8161, + "theoretical_loss": 3.6351490649174334, + "tokens_seen": 1041507328 + }, + { + "epoch": 3.0, + "learning_rate": 0.000345667001003009, + "loss": 2.9287, + "theoretical_loss": 3.635127460757022, + "tokens_seen": 1041572864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034565697091273824, + "loss": 2.8224, + "theoretical_loss": 3.6351058583364946, + "tokens_seen": 1041638400 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034564694082246737, + "loss": 2.8641, + "theoretical_loss": 3.635084257655601, + "tokens_seen": 1041703936 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003456369107321966, + "loss": 2.9568, + "theoretical_loss": 3.6350626587140913, + "tokens_seen": 1041769472 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003456268806419258, + "loss": 2.8227, + "theoretical_loss": 3.635041061511717, + "tokens_seen": 1041835008 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034561685055165496, + "loss": 2.906, + "theoretical_loss": 3.635019466048228, + "tokens_seen": 1041900544 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034560682046138415, + "loss": 2.8525, + "theoretical_loss": 3.6349978723233747, + "tokens_seen": 1041966080 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1674131, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9455406665802, + "objective/train/theoretical_loss": 3.634992474163794, + "objective/train/tokens_used": 1062442464, + "theoretical_loss": 3.634992474163794, + "tokens_seen": 1041982464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003455967903711134, + "loss": 2.7669, + "theoretical_loss": 3.6349762803369083, + "tokens_seen": 1042031616 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003455867602808425, + "loss": 2.9773, + "theoretical_loss": 3.6349546900885796, + "tokens_seen": 1042097152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034557673019057174, + "loss": 2.817, + "theoretical_loss": 3.634933101578139, + "tokens_seen": 1042162688 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034556670010030087, + "loss": 2.8073, + "theoretical_loss": 3.6349115148053377, + "tokens_seen": 1042228224 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003455566700100301, + "loss": 3.0236, + "theoretical_loss": 3.634889929769926, + "tokens_seen": 1042293760 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003455466399197593, + "loss": 2.7965, + "theoretical_loss": 3.6348683464716554, + "tokens_seen": 1042359296 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034553660982948847, + "loss": 2.9556, + "theoretical_loss": 3.634846764910277, + "tokens_seen": 1042424832 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034552657973921765, + "loss": 2.9714, + "theoretical_loss": 3.634825185085541, + "tokens_seen": 1042490368 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034551654964894683, + "loss": 3.0543, + "theoretical_loss": 3.634803606997199, + "tokens_seen": 1042555904 + }, + { + "epoch": 3.01, + "learning_rate": 0.000345506519558676, + "loss": 2.8726, + "theoretical_loss": 3.634782030645003, + "tokens_seen": 1042621440 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034549648946840525, + "loss": 2.9438, + "theoretical_loss": 3.634760456028703, + "tokens_seen": 1042686976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003454864593781344, + "loss": 2.864, + "theoretical_loss": 3.6347388831480503, + "tokens_seen": 1042752512 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003454764292878636, + "loss": 2.9229, + "theoretical_loss": 3.634717312002797, + "tokens_seen": 1042818048 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034546639919759274, + "loss": 2.8834, + "theoretical_loss": 3.634695742592694, + "tokens_seen": 1042883584 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034545636910732197, + "loss": 2.956, + "theoretical_loss": 3.6346741749174925, + "tokens_seen": 1042949120 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034544633901705115, + "loss": 2.9208, + "theoretical_loss": 3.6346526089769435, + "tokens_seen": 1043014656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034543630892678033, + "loss": 2.9915, + "theoretical_loss": 3.6346310447708, + "tokens_seen": 1043080192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003454262788365095, + "loss": 2.8574, + "theoretical_loss": 3.6346094822988126, + "tokens_seen": 1043145728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034541624874623875, + "loss": 2.9268, + "theoretical_loss": 3.6345879215607333, + "tokens_seen": 1043211264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003454062186559679, + "loss": 2.9771, + "theoretical_loss": 3.6345663625563125, + "tokens_seen": 1043276800 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003453961885656971, + "loss": 2.9449, + "theoretical_loss": 3.6345448052853038, + "tokens_seen": 1043342336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034538615847542624, + "loss": 2.9393, + "theoretical_loss": 3.634523249747458, + "tokens_seen": 1043407872 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003453761283851555, + "loss": 3.0049, + "theoretical_loss": 3.6345016959425265, + "tokens_seen": 1043473408 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034536609829488466, + "loss": 2.8435, + "theoretical_loss": 3.6344801438702614, + "tokens_seen": 1043538944 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034535606820461384, + "loss": 3.0526, + "theoretical_loss": 3.6344585935304154, + "tokens_seen": 1043604480 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1677074, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.701413869857788, + "objective/train/theoretical_loss": 3.634453206216115, + "objective/train/tokens_used": 1064080864, + "theoretical_loss": 3.634453206216115, + "tokens_seen": 1043620864 + }, + { + "epoch": 3.01, + "learning_rate": 0.000345346038114343, + "loss": 2.7629, + "theoretical_loss": 3.63443704492274, + "tokens_seen": 1043670016 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003453360080240722, + "loss": 2.8483, + "theoretical_loss": 3.6344154980469865, + "tokens_seen": 1043735552 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034532597793380143, + "loss": 2.822, + "theoretical_loss": 3.634393952902908, + "tokens_seen": 1043801088 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003453159478435306, + "loss": 2.8139, + "theoretical_loss": 3.634372409490256, + "tokens_seen": 1043866624 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003453059177532598, + "loss": 2.8287, + "theoretical_loss": 3.6343508678087826, + "tokens_seen": 1043932160 + }, + { + "epoch": 3.01, + "learning_rate": 0.000345295887662989, + "loss": 2.8921, + "theoretical_loss": 3.6343293278582407, + "tokens_seen": 1043997696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003452858575727182, + "loss": 2.7233, + "theoretical_loss": 3.634307789638382, + "tokens_seen": 1044063232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034527582748244734, + "loss": 2.9894, + "theoretical_loss": 3.634286253148959, + "tokens_seen": 1044128768 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003452657973921766, + "loss": 2.8864, + "theoretical_loss": 3.634264718389724, + "tokens_seen": 1044194304 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003452557673019057, + "loss": 2.9403, + "theoretical_loss": 3.63424318536043, + "tokens_seen": 1044259840 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034524573721163494, + "loss": 2.9731, + "theoretical_loss": 3.634221654060828, + "tokens_seen": 1044325376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003452357071213641, + "loss": 3.0012, + "theoretical_loss": 3.634200124490672, + "tokens_seen": 1044390912 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003452256770310933, + "loss": 2.7391, + "theoretical_loss": 3.634178596649714, + "tokens_seen": 1044456448 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003452156469408225, + "loss": 2.9031, + "theoretical_loss": 3.6341570705377064, + "tokens_seen": 1044521984 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034520561685055166, + "loss": 2.9336, + "theoretical_loss": 3.6341355461544023, + "tokens_seen": 1044587520 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034519558676028084, + "loss": 2.8874, + "theoretical_loss": 3.6341140234995546, + "tokens_seen": 1044653056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003451855566700101, + "loss": 2.8861, + "theoretical_loss": 3.6340925025729147, + "tokens_seen": 1044718592 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003451755265797392, + "loss": 3.0419, + "theoretical_loss": 3.6340709833742375, + "tokens_seen": 1044784128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034516549648946844, + "loss": 2.9842, + "theoretical_loss": 3.6340494659032747, + "tokens_seen": 1044849664 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034515546639919757, + "loss": 2.8891, + "theoretical_loss": 3.6340279501597794, + "tokens_seen": 1044915200 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003451454363089268, + "loss": 2.829, + "theoretical_loss": 3.6340064361435047, + "tokens_seen": 1044980736 + }, + { + "epoch": 3.01, + "learning_rate": 0.000345135406218656, + "loss": 2.6982, + "theoretical_loss": 3.6339849238542032, + "tokens_seen": 1045046272 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034512537612838517, + "loss": 2.8852, + "theoretical_loss": 3.633963413291628, + "tokens_seen": 1045111808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034511534603811435, + "loss": 2.8421, + "theoretical_loss": 3.6339419044555332, + "tokens_seen": 1045177344 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003451053159478436, + "loss": 2.7891, + "theoretical_loss": 3.633920397345671, + "tokens_seen": 1045242880 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1678483, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9230730533599854, + "objective/train/theoretical_loss": 3.6339150208379003, + "objective/train/tokens_used": 1065719264, + "theoretical_loss": 3.6339150208379003, + "tokens_seen": 1045259264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003450952858575727, + "loss": 2.9192, + "theoretical_loss": 3.6338988919617954, + "tokens_seen": 1045308416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034508525576730194, + "loss": 2.7847, + "theoretical_loss": 3.6338773883036586, + "tokens_seen": 1045373952 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034507522567703107, + "loss": 2.8944, + "theoretical_loss": 3.6338558863710153, + "tokens_seen": 1045439488 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003450651955867603, + "loss": 2.9064, + "theoretical_loss": 3.6338343861636173, + "tokens_seen": 1045505024 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003450551654964895, + "loss": 3.129, + "theoretical_loss": 3.6338128876812195, + "tokens_seen": 1045570560 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034504513540621867, + "loss": 2.8684, + "theoretical_loss": 3.633791390923575, + "tokens_seen": 1045636096 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034503510531594785, + "loss": 2.7863, + "theoretical_loss": 3.6337698958904365, + "tokens_seen": 1045701632 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034502507522567703, + "loss": 2.8542, + "theoretical_loss": 3.6337484025815585, + "tokens_seen": 1045767168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003450150451354062, + "loss": 2.7085, + "theoretical_loss": 3.6337269109966943, + "tokens_seen": 1045832704 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034500501504513545, + "loss": 2.8002, + "theoretical_loss": 3.6337054211355984, + "tokens_seen": 1045898240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003449949849548646, + "loss": 2.8747, + "theoretical_loss": 3.633683932998023, + "tokens_seen": 1045963776 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003449849548645938, + "loss": 2.6989, + "theoretical_loss": 3.633662446583723, + "tokens_seen": 1046029312 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034497492477432294, + "loss": 2.7764, + "theoretical_loss": 3.6336409618924517, + "tokens_seen": 1046094848 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034496489468405217, + "loss": 2.8394, + "theoretical_loss": 3.6336194789239635, + "tokens_seen": 1046160384 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034495486459378135, + "loss": 2.9563, + "theoretical_loss": 3.633597997678012, + "tokens_seen": 1046225920 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034494483450351053, + "loss": 2.8579, + "theoretical_loss": 3.6335765181543516, + "tokens_seen": 1046291456 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003449348044132397, + "loss": 2.9404, + "theoretical_loss": 3.6335550403527357, + "tokens_seen": 1046356992 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034492477432296895, + "loss": 2.895, + "theoretical_loss": 3.633533564272919, + "tokens_seen": 1046422528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003449147442326981, + "loss": 2.816, + "theoretical_loss": 3.633512089914656, + "tokens_seen": 1046488064 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003449047141424273, + "loss": 2.8131, + "theoretical_loss": 3.6334906172776993, + "tokens_seen": 1046553600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034489468405215644, + "loss": 2.8809, + "theoretical_loss": 3.6334691463618043, + "tokens_seen": 1046619136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003448846539618857, + "loss": 2.991, + "theoretical_loss": 3.6334476771667252, + "tokens_seen": 1046684672 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034487462387161486, + "loss": 2.7835, + "theoretical_loss": 3.6334262096922165, + "tokens_seen": 1046750208 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034486459378134404, + "loss": 2.7837, + "theoretical_loss": 3.633404743938032, + "tokens_seen": 1046815744 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003448545636910732, + "loss": 2.9417, + "theoretical_loss": 3.633383279903927, + "tokens_seen": 1046881280 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1681281, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9433462619781494, + "objective/train/theoretical_loss": 3.633377914164134, + "objective/train/tokens_used": 1067357664, + "theoretical_loss": 3.633377914164134, + "tokens_seen": 1046897664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003448445336008024, + "loss": 2.9767, + "theoretical_loss": 3.6333618175896554, + "tokens_seen": 1046946816 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003448345035105316, + "loss": 2.8803, + "theoretical_loss": 3.633340356994972, + "tokens_seen": 1047012352 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003448244734202608, + "loss": 3.0119, + "theoretical_loss": 3.633318898119631, + "tokens_seen": 1047077888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034481444332998994, + "loss": 2.8175, + "theoretical_loss": 3.6332974409633874, + "tokens_seen": 1047143424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003448044132397192, + "loss": 2.8023, + "theoretical_loss": 3.633275985525996, + "tokens_seen": 1047208960 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034479438314944836, + "loss": 3.0181, + "theoretical_loss": 3.6332545318072116, + "tokens_seen": 1047274496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034478435305917754, + "loss": 3.0137, + "theoretical_loss": 3.6332330798067884, + "tokens_seen": 1047340032 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003447743229689067, + "loss": 2.9507, + "theoretical_loss": 3.633211629524482, + "tokens_seen": 1047405568 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003447642928786359, + "loss": 2.9186, + "theoretical_loss": 3.633190180960047, + "tokens_seen": 1047471104 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003447542627883651, + "loss": 2.9093, + "theoretical_loss": 3.633168734113238, + "tokens_seen": 1047536640 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003447442326980943, + "loss": 2.7881, + "theoretical_loss": 3.633147288983811, + "tokens_seen": 1047602176 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034473420260782345, + "loss": 2.9612, + "theoretical_loss": 3.63312584557152, + "tokens_seen": 1047667712 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003447241725175527, + "loss": 2.9116, + "theoretical_loss": 3.6331044038761204, + "tokens_seen": 1047733248 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003447141424272818, + "loss": 2.8966, + "theoretical_loss": 3.633082963897367, + "tokens_seen": 1047798784 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034470411233701104, + "loss": 2.9355, + "theoretical_loss": 3.633061525635016, + "tokens_seen": 1047864320 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003446940822467402, + "loss": 2.9358, + "theoretical_loss": 3.6330400890888224, + "tokens_seen": 1047929856 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003446840521564694, + "loss": 2.9263, + "theoretical_loss": 3.6330186542585405, + "tokens_seen": 1047995392 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003446740220661986, + "loss": 2.9376, + "theoretical_loss": 3.632997221143927, + "tokens_seen": 1048060928 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034466399197592777, + "loss": 2.8919, + "theoretical_loss": 3.6329757897447363, + "tokens_seen": 1048126464 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034465396188565695, + "loss": 2.7922, + "theoretical_loss": 3.632954360060725, + "tokens_seen": 1048192000 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003446439317953862, + "loss": 2.8743, + "theoretical_loss": 3.632932932091647, + "tokens_seen": 1048257536 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003446339017051153, + "loss": 2.9688, + "theoretical_loss": 3.632911505837259, + "tokens_seen": 1048323072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034462387161484455, + "loss": 2.7816, + "theoretical_loss": 3.632890081297316, + "tokens_seen": 1048388608 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034461384152457373, + "loss": 2.822, + "theoretical_loss": 3.6328686584715744, + "tokens_seen": 1048454144 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003446038114343029, + "loss": 2.8609, + "theoretical_loss": 3.6328472373597895, + "tokens_seen": 1048519680 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1683900, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7902274131774902, + "objective/train/theoretical_loss": 3.63284188234962, + "objective/train/tokens_used": 1068996064, + "theoretical_loss": 3.63284188234962, + "tokens_seen": 1048536064 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034459378134403214, + "loss": 2.8752, + "theoretical_loss": 3.6328258179617166, + "tokens_seen": 1048585216 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034458375125376127, + "loss": 2.7439, + "theoretical_loss": 3.6328044002771125, + "tokens_seen": 1048650752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003445737211634905, + "loss": 2.9253, + "theoretical_loss": 3.632782984305732, + "tokens_seen": 1048716288 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003445636910732197, + "loss": 3.0828, + "theoretical_loss": 3.6327615700473324, + "tokens_seen": 1048781824 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034455366098294887, + "loss": 2.8492, + "theoretical_loss": 3.632740157501668, + "tokens_seen": 1048847360 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034454363089267805, + "loss": 2.9416, + "theoretical_loss": 3.6327187466684956, + "tokens_seen": 1048912896 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034453360080240723, + "loss": 2.9195, + "theoretical_loss": 3.6326973375475715, + "tokens_seen": 1048978432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003445235707121364, + "loss": 2.9205, + "theoretical_loss": 3.6326759301386513, + "tokens_seen": 1049043968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034451354062186565, + "loss": 3.0163, + "theoretical_loss": 3.6326545244414916, + "tokens_seen": 1049109504 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003445035105315948, + "loss": 2.8364, + "theoretical_loss": 3.632633120455848, + "tokens_seen": 1049175040 + }, + { + "epoch": 3.01, + "learning_rate": 0.000344493480441324, + "loss": 2.8637, + "theoretical_loss": 3.6326117181814777, + "tokens_seen": 1049240576 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034448345035105314, + "loss": 2.9571, + "theoretical_loss": 3.6325903176181358, + "tokens_seen": 1049306112 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034447342026078237, + "loss": 2.896, + "theoretical_loss": 3.63256891876558, + "tokens_seen": 1049371648 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034446339017051155, + "loss": 2.9194, + "theoretical_loss": 3.6325475216235663, + "tokens_seen": 1049437184 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034445336008024073, + "loss": 2.9042, + "theoretical_loss": 3.6325261261918502, + "tokens_seen": 1049502720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003444433299899699, + "loss": 2.8453, + "theoretical_loss": 3.632504732470189, + "tokens_seen": 1049568256 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034443329989969915, + "loss": 2.9036, + "theoretical_loss": 3.6324833404583394, + "tokens_seen": 1049633792 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003444232698094283, + "loss": 2.7814, + "theoretical_loss": 3.6324619501560576, + "tokens_seen": 1049699328 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003444132397191575, + "loss": 2.9613, + "theoretical_loss": 3.632440561563101, + "tokens_seen": 1049764864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034440320962888664, + "loss": 2.9787, + "theoretical_loss": 3.632419174679225, + "tokens_seen": 1049830400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003443931795386159, + "loss": 2.8968, + "theoretical_loss": 3.632397789504187, + "tokens_seen": 1049895936 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034438314944834506, + "loss": 3.0333, + "theoretical_loss": 3.6323764060377446, + "tokens_seen": 1049961472 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034437311935807424, + "loss": 2.8555, + "theoretical_loss": 3.6323550242796534, + "tokens_seen": 1050027008 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003443630892678034, + "loss": 3.1289, + "theoretical_loss": 3.632333644229671, + "tokens_seen": 1050092544 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003443530591775326, + "loss": 2.9375, + "theoretical_loss": 3.6323122658875544, + "tokens_seen": 1050158080 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1686597, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9695985317230225, + "objective/train/theoretical_loss": 3.632306921568851, + "objective/train/tokens_used": 1070634464, + "theoretical_loss": 3.632306921568851, + "tokens_seen": 1050174464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003443430290872618, + "loss": 2.8821, + "theoretical_loss": 3.63229088925306, + "tokens_seen": 1050223616 + }, + { + "epoch": 3.01, + "learning_rate": 0.000344332998996991, + "loss": 2.8405, + "theoretical_loss": 3.632269514325946, + "tokens_seen": 1050289152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034432296890672014, + "loss": 3.0446, + "theoretical_loss": 3.6322481411059684, + "tokens_seen": 1050354688 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003443129388164494, + "loss": 2.9512, + "theoretical_loss": 3.6322267695928847, + "tokens_seen": 1050420224 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034430290872617856, + "loss": 2.7754, + "theoretical_loss": 3.632205399786452, + "tokens_seen": 1050485760 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034429287863590774, + "loss": 2.932, + "theoretical_loss": 3.632184031686428, + "tokens_seen": 1050551296 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003442828485456369, + "loss": 2.7159, + "theoretical_loss": 3.6321626652925696, + "tokens_seen": 1050616832 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003442728184553661, + "loss": 2.8472, + "theoretical_loss": 3.6321413006046344, + "tokens_seen": 1050682368 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003442627883650953, + "loss": 2.8577, + "theoretical_loss": 3.632119937622379, + "tokens_seen": 1050747904 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003442527582748245, + "loss": 2.9372, + "theoretical_loss": 3.632098576345562, + "tokens_seen": 1050813440 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034424272818455365, + "loss": 2.9315, + "theoretical_loss": 3.6320772167739404, + "tokens_seen": 1050878976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003442326980942829, + "loss": 2.9587, + "theoretical_loss": 3.632055858907272, + "tokens_seen": 1050944512 + }, + { + "epoch": 3.01, + "learning_rate": 0.000344222668004012, + "loss": 2.9454, + "theoretical_loss": 3.632034502745314, + "tokens_seen": 1051010048 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034421263791374124, + "loss": 2.9053, + "theoretical_loss": 3.6320131482878235, + "tokens_seen": 1051075584 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003442026078234704, + "loss": 2.9068, + "theoretical_loss": 3.6319917955345598, + "tokens_seen": 1051141120 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003441925777331996, + "loss": 2.969, + "theoretical_loss": 3.6319704444852796, + "tokens_seen": 1051206656 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003441825476429288, + "loss": 2.7918, + "theoretical_loss": 3.6319490951397406, + "tokens_seen": 1051272192 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034417251755265797, + "loss": 2.9944, + "theoretical_loss": 3.631927747497701, + "tokens_seen": 1051337728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034416248746238715, + "loss": 3.0034, + "theoretical_loss": 3.631906401558919, + "tokens_seen": 1051403264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003441524573721164, + "loss": 2.8342, + "theoretical_loss": 3.6318850573231516, + "tokens_seen": 1051468800 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003441424272818455, + "loss": 2.8569, + "theoretical_loss": 3.631863714790158, + "tokens_seen": 1051534336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034413239719157475, + "loss": 2.8444, + "theoretical_loss": 3.631842373959695, + "tokens_seen": 1051599872 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034412236710130393, + "loss": 2.9622, + "theoretical_loss": 3.631821034831521, + "tokens_seen": 1051665408 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003441123370110331, + "loss": 3.0332, + "theoretical_loss": 3.6317996974053957, + "tokens_seen": 1051730944 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003441023069207623, + "loss": 2.9733, + "theoretical_loss": 3.631778361681075, + "tokens_seen": 1051796480 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1689245, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5542352199554443, + "objective/train/theoretical_loss": 3.631773028015874, + "objective/train/tokens_used": 1072272864, + "theoretical_loss": 3.631773028015874, + "tokens_seen": 1051812864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034409227683049147, + "loss": 2.9157, + "theoretical_loss": 3.6317570276583186, + "tokens_seen": 1051862016 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034408224674022065, + "loss": 2.9163, + "theoretical_loss": 3.631735695336884, + "tokens_seen": 1051927552 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003440722166499499, + "loss": 2.8229, + "theoretical_loss": 3.6317143647165304, + "tokens_seen": 1051993088 + }, + { + "epoch": 3.01, + "learning_rate": 0.000344062186559679, + "loss": 2.686, + "theoretical_loss": 3.6316930357970154, + "tokens_seen": 1052058624 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034405215646940825, + "loss": 2.8541, + "theoretical_loss": 3.6316717085780983, + "tokens_seen": 1052124160 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003440421263791374, + "loss": 2.7988, + "theoretical_loss": 3.6316503830595366, + "tokens_seen": 1052189696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003440320962888666, + "loss": 3.0341, + "theoretical_loss": 3.631629059241089, + "tokens_seen": 1052255232 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003440220661985958, + "loss": 2.7975, + "theoretical_loss": 3.631607737122515, + "tokens_seen": 1052320768 + }, + { + "epoch": 3.01, + "learning_rate": 0.000344012036108325, + "loss": 2.8636, + "theoretical_loss": 3.6315864167035725, + "tokens_seen": 1052386304 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034400200601805416, + "loss": 2.7883, + "theoretical_loss": 3.63156509798402, + "tokens_seen": 1052451840 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034399197592778334, + "loss": 2.8457, + "theoretical_loss": 3.6315437809636166, + "tokens_seen": 1052517376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003439819458375125, + "loss": 2.8396, + "theoretical_loss": 3.6315224656421212, + "tokens_seen": 1052582912 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034397191574724175, + "loss": 2.8814, + "theoretical_loss": 3.6315011520192924, + "tokens_seen": 1052648448 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003439618856569709, + "loss": 2.8657, + "theoretical_loss": 3.6314798400948893, + "tokens_seen": 1052713984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003439518555667001, + "loss": 2.8709, + "theoretical_loss": 3.631458529868671, + "tokens_seen": 1052779520 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003439418254764293, + "loss": 2.9098, + "theoretical_loss": 3.631437221340396, + "tokens_seen": 1052845056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003439317953861585, + "loss": 2.7446, + "theoretical_loss": 3.6314159145098226, + "tokens_seen": 1052910592 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034392176529588766, + "loss": 2.8006, + "theoretical_loss": 3.631394609376712, + "tokens_seen": 1052976128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034391173520561684, + "loss": 2.6777, + "theoretical_loss": 3.631373305940821, + "tokens_seen": 1053041664 + }, + { + "epoch": 3.01, + "learning_rate": 0.000343901705115346, + "loss": 2.7986, + "theoretical_loss": 3.6313520042019105, + "tokens_seen": 1053107200 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034389167502507526, + "loss": 2.8487, + "theoretical_loss": 3.6313307041597396, + "tokens_seen": 1053172736 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003438816449348044, + "loss": 2.9277, + "theoretical_loss": 3.6313094058140667, + "tokens_seen": 1053238272 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003438716148445336, + "loss": 2.9292, + "theoretical_loss": 3.631288109164651, + "tokens_seen": 1053303808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034386158475426275, + "loss": 2.9168, + "theoretical_loss": 3.6312668142112536, + "tokens_seen": 1053369344 + }, + { + "epoch": 3.01, + "learning_rate": 0.000343851554663992, + "loss": 2.8124, + "theoretical_loss": 3.631245520953632, + "tokens_seen": 1053434880 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1691935, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.00308895111084, + "objective/train/theoretical_loss": 3.6312401979041637, + "objective/train/tokens_used": 1073911264, + "theoretical_loss": 3.6312401979041637, + "tokens_seen": 1053451264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003438415245737212, + "loss": 2.7028, + "theoretical_loss": 3.6312242293915467, + "tokens_seen": 1053500416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034383149448345034, + "loss": 2.8645, + "theoretical_loss": 3.6312029395247567, + "tokens_seen": 1053565952 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003438214643931796, + "loss": 2.8716, + "theoretical_loss": 3.631181651353022, + "tokens_seen": 1053631488 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034381143430290876, + "loss": 2.9171, + "theoretical_loss": 3.631160364876102, + "tokens_seen": 1053697024 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034380140421263794, + "loss": 3.0103, + "theoretical_loss": 3.6311390800937566, + "tokens_seen": 1053762560 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003437913741223671, + "loss": 2.8842, + "theoretical_loss": 3.6311177970057456, + "tokens_seen": 1053828096 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003437813440320963, + "loss": 2.9222, + "theoretical_loss": 3.631096515611828, + "tokens_seen": 1053893632 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003437713139418255, + "loss": 2.7968, + "theoretical_loss": 3.631075235911764, + "tokens_seen": 1053959168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003437612838515547, + "loss": 2.8489, + "theoretical_loss": 3.631053957905314, + "tokens_seen": 1054024704 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034375125376128385, + "loss": 2.9385, + "theoretical_loss": 3.6310326815922376, + "tokens_seen": 1054090240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003437412236710131, + "loss": 2.9037, + "theoretical_loss": 3.631011406972294, + "tokens_seen": 1054155776 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003437311935807422, + "loss": 2.9167, + "theoretical_loss": 3.630990134045245, + "tokens_seen": 1054221312 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034372116349047144, + "loss": 2.8421, + "theoretical_loss": 3.6309688628108487, + "tokens_seen": 1054286848 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003437111334002006, + "loss": 2.9542, + "theoretical_loss": 3.630947593268866, + "tokens_seen": 1054352384 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003437011033099298, + "loss": 2.7836, + "theoretical_loss": 3.6309263254190576, + "tokens_seen": 1054417920 + }, + { + "epoch": 3.01, + "learning_rate": 0.000343691073219659, + "loss": 3.019, + "theoretical_loss": 3.6309050592611833, + "tokens_seen": 1054483456 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034368104312938817, + "loss": 2.9358, + "theoretical_loss": 3.630883794795003, + "tokens_seen": 1054548992 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034367101303911735, + "loss": 2.9131, + "theoretical_loss": 3.6308625320202776, + "tokens_seen": 1054614528 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003436609829488466, + "loss": 2.8866, + "theoretical_loss": 3.6308412709367675, + "tokens_seen": 1054680064 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003436509528585757, + "loss": 2.9949, + "theoretical_loss": 3.630820011544232, + "tokens_seen": 1054745600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034364092276830495, + "loss": 2.8652, + "theoretical_loss": 3.630798753842433, + "tokens_seen": 1054811136 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034363089267803413, + "loss": 2.7836, + "theoretical_loss": 3.63077749783113, + "tokens_seen": 1054876672 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003436208625877633, + "loss": 2.8589, + "theoretical_loss": 3.630756243510084, + "tokens_seen": 1054942208 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003436108324974925, + "loss": 2.8093, + "theoretical_loss": 3.630734990879055, + "tokens_seen": 1055007744 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034360080240722167, + "loss": 2.9076, + "theoretical_loss": 3.6307137399378044, + "tokens_seen": 1055073280 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1693381, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7660012245178223, + "objective/train/theoretical_loss": 3.630708427466492, + "objective/train/tokens_used": 1075549664, + "theoretical_loss": 3.630708427466492, + "tokens_seen": 1055089664 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034359077231695085, + "loss": 2.863, + "theoretical_loss": 3.6306924906860933, + "tokens_seen": 1055138816 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003435807422266801, + "loss": 2.9265, + "theoretical_loss": 3.630671243123681, + "tokens_seen": 1055204352 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003435707121364092, + "loss": 2.9151, + "theoretical_loss": 3.6306499972503294, + "tokens_seen": 1055269888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034356068204613845, + "loss": 2.8872, + "theoretical_loss": 3.6306287530657992, + "tokens_seen": 1055335424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003435506519558676, + "loss": 2.9322, + "theoretical_loss": 3.630607510569851, + "tokens_seen": 1055400960 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003435406218655968, + "loss": 3.0044, + "theoretical_loss": 3.630586269762246, + "tokens_seen": 1055466496 + }, + { + "epoch": 3.01, + "learning_rate": 0.000343530591775326, + "loss": 2.9249, + "theoretical_loss": 3.630565030642745, + "tokens_seen": 1055532032 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003435205616850552, + "loss": 2.8395, + "theoretical_loss": 3.6305437932111086, + "tokens_seen": 1055597568 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034351053159478436, + "loss": 2.8529, + "theoretical_loss": 3.630522557467099, + "tokens_seen": 1055663104 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034350050150451354, + "loss": 2.787, + "theoretical_loss": 3.630501323410477, + "tokens_seen": 1055728640 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003434904714142427, + "loss": 2.8678, + "theoretical_loss": 3.6304800910410027, + "tokens_seen": 1055794176 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034348044132397195, + "loss": 2.8076, + "theoretical_loss": 3.6304588603584387, + "tokens_seen": 1055859712 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003434704112337011, + "loss": 2.8647, + "theoretical_loss": 3.6304376313625455, + "tokens_seen": 1055925248 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003434603811434303, + "loss": 3.0539, + "theoretical_loss": 3.6304164040530846, + "tokens_seen": 1055990784 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003434503510531595, + "loss": 2.7731, + "theoretical_loss": 3.6303951784298176, + "tokens_seen": 1056056320 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003434403209628887, + "loss": 2.9271, + "theoretical_loss": 3.630373954492506, + "tokens_seen": 1056121856 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034343029087261786, + "loss": 2.939, + "theoretical_loss": 3.630352732240911, + "tokens_seen": 1056187392 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034342026078234704, + "loss": 2.7556, + "theoretical_loss": 3.6303315116747936, + "tokens_seen": 1056252928 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003434102306920762, + "loss": 2.8679, + "theoretical_loss": 3.6303102927939164, + "tokens_seen": 1056318464 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034340020060180546, + "loss": 2.9887, + "theoretical_loss": 3.63028907559804, + "tokens_seen": 1056384000 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003433901705115346, + "loss": 2.8518, + "theoretical_loss": 3.630267860086928, + "tokens_seen": 1056449536 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003433801404212638, + "loss": 2.9198, + "theoretical_loss": 3.6302466462603395, + "tokens_seen": 1056515072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034337011033099295, + "loss": 2.823, + "theoretical_loss": 3.6302254341180378, + "tokens_seen": 1056580608 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003433600802407222, + "loss": 2.8793, + "theoretical_loss": 3.630204223659785, + "tokens_seen": 1056646144 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034335005015045136, + "loss": 2.8256, + "theoretical_loss": 3.6301830148853416, + "tokens_seen": 1056711680 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1696185, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1694750785827637, + "objective/train/theoretical_loss": 3.6301777129547985, + "objective/train/tokens_used": 1077188064, + "theoretical_loss": 3.6301777129547985, + "tokens_seen": 1056728064 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034334002006018054, + "loss": 2.8715, + "theoretical_loss": 3.630161807794471, + "tokens_seen": 1056777216 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003433299899699097, + "loss": 3.04, + "theoretical_loss": 3.6301406023869345, + "tokens_seen": 1056842752 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003433199598796389, + "loss": 2.9187, + "theoretical_loss": 3.630119398662494, + "tokens_seen": 1056908288 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003433099297893681, + "loss": 2.8614, + "theoretical_loss": 3.630098196620911, + "tokens_seen": 1056973824 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003432998996990973, + "loss": 2.9436, + "theoretical_loss": 3.6300769962619492, + "tokens_seen": 1057039360 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034328986960882645, + "loss": 2.8769, + "theoretical_loss": 3.630055797585369, + "tokens_seen": 1057104896 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003432798395185557, + "loss": 2.9263, + "theoretical_loss": 3.630034600590934, + "tokens_seen": 1057170432 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034326980942828487, + "loss": 2.9379, + "theoretical_loss": 3.630013405278406, + "tokens_seen": 1057235968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034325977933801405, + "loss": 2.8096, + "theoretical_loss": 3.6299922116475467, + "tokens_seen": 1057301504 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034324974924774323, + "loss": 2.7892, + "theoretical_loss": 3.6299710196981194, + "tokens_seen": 1057367040 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003432397191574724, + "loss": 2.659, + "theoretical_loss": 3.6299498294298855, + "tokens_seen": 1057432576 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003432296890672016, + "loss": 2.7731, + "theoretical_loss": 3.6299286408426084, + "tokens_seen": 1057498112 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003432196589769308, + "loss": 2.9308, + "theoretical_loss": 3.6299074539360503, + "tokens_seen": 1057563648 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034320962888665995, + "loss": 2.8336, + "theoretical_loss": 3.6298862687099733, + "tokens_seen": 1057629184 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003431995987963892, + "loss": 2.8989, + "theoretical_loss": 3.6298650851641403, + "tokens_seen": 1057694720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003431895687061183, + "loss": 2.7429, + "theoretical_loss": 3.6298439032983136, + "tokens_seen": 1057760256 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034317953861584755, + "loss": 2.6985, + "theoretical_loss": 3.629822723112257, + "tokens_seen": 1057825792 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034316950852557673, + "loss": 2.9279, + "theoretical_loss": 3.629801544605732, + "tokens_seen": 1057891328 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003431594784353059, + "loss": 2.9887, + "theoretical_loss": 3.629780367778502, + "tokens_seen": 1057956864 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003431494483450351, + "loss": 2.9309, + "theoretical_loss": 3.6297591926303294, + "tokens_seen": 1058022400 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034313941825476433, + "loss": 2.9597, + "theoretical_loss": 3.629738019160978, + "tokens_seen": 1058087936 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034312938816449345, + "loss": 2.9034, + "theoretical_loss": 3.6297168473702097, + "tokens_seen": 1058153472 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003431193580742227, + "loss": 2.8164, + "theoretical_loss": 3.6296956772577875, + "tokens_seen": 1058219008 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003431093279839518, + "loss": 2.9474, + "theoretical_loss": 3.6296745088234754, + "tokens_seen": 1058284544 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034309929789368105, + "loss": 2.8884, + "theoretical_loss": 3.629653342067036, + "tokens_seen": 1058350080 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1699207, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5064847469329834, + "objective/train/theoretical_loss": 3.629648050640066, + "objective/train/tokens_used": 1078826464, + "theoretical_loss": 3.629648050640066, + "tokens_seen": 1058366464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003430892678034103, + "loss": 2.7437, + "theoretical_loss": 3.6296321769882316, + "tokens_seen": 1058415616 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003430792377131394, + "loss": 2.8663, + "theoretical_loss": 3.6296110135868265, + "tokens_seen": 1058481152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034306920762286865, + "loss": 2.7905, + "theoretical_loss": 3.6295898518625833, + "tokens_seen": 1058546688 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003430591775325978, + "loss": 2.8479, + "theoretical_loss": 3.6295686918152654, + "tokens_seen": 1058612224 + }, + { + "epoch": 3.01, + "learning_rate": 0.000343049147442327, + "loss": 2.8745, + "theoretical_loss": 3.629547533444636, + "tokens_seen": 1058677760 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003430391173520562, + "loss": 2.724, + "theoretical_loss": 3.6295263767504586, + "tokens_seen": 1058743296 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003430290872617854, + "loss": 2.7686, + "theoretical_loss": 3.6295052217324972, + "tokens_seen": 1058808832 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034301905717151456, + "loss": 2.8047, + "theoretical_loss": 3.6294840683905143, + "tokens_seen": 1058874368 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034300902708124374, + "loss": 2.8603, + "theoretical_loss": 3.6294629167242736, + "tokens_seen": 1058939904 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003429989969909729, + "loss": 2.8464, + "theoretical_loss": 3.629441766733539, + "tokens_seen": 1059005440 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034298896690070215, + "loss": 2.844, + "theoretical_loss": 3.6294206184180737, + "tokens_seen": 1059070976 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003429789368104313, + "loss": 2.8476, + "theoretical_loss": 3.629399471777642, + "tokens_seen": 1059136512 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003429689067201605, + "loss": 2.9508, + "theoretical_loss": 3.6293783268120072, + "tokens_seen": 1059202048 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003429588766298897, + "loss": 2.821, + "theoretical_loss": 3.629357183520933, + "tokens_seen": 1059267584 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003429488465396189, + "loss": 2.8621, + "theoretical_loss": 3.6293360419041827, + "tokens_seen": 1059333120 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034293881644934806, + "loss": 2.788, + "theoretical_loss": 3.6293149019615214, + "tokens_seen": 1059398656 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034292878635907724, + "loss": 2.85, + "theoretical_loss": 3.6292937636927114, + "tokens_seen": 1059464192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003429187562688064, + "loss": 2.723, + "theoretical_loss": 3.629272627097518, + "tokens_seen": 1059529728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034290872617853566, + "loss": 2.8302, + "theoretical_loss": 3.6292514921757046, + "tokens_seen": 1059595264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003428986960882648, + "loss": 2.8705, + "theoretical_loss": 3.6292303589270354, + "tokens_seen": 1059660800 + }, + { + "epoch": 3.01, + "learning_rate": 0.000342888665997994, + "loss": 2.8221, + "theoretical_loss": 3.629209227351274, + "tokens_seen": 1059726336 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034287863590772315, + "loss": 2.9427, + "theoretical_loss": 3.6291880974481856, + "tokens_seen": 1059791872 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003428686058174524, + "loss": 2.9757, + "theoretical_loss": 3.6291669692175335, + "tokens_seen": 1059857408 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034285857572718156, + "loss": 3.0036, + "theoretical_loss": 3.6291458426590815, + "tokens_seen": 1059922944 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034284854563691074, + "loss": 3.0041, + "theoretical_loss": 3.629124717772595, + "tokens_seen": 1059988480 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1701903, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8911871910095215, + "objective/train/theoretical_loss": 3.62911943681219, + "objective/train/tokens_used": 1080464864, + "theoretical_loss": 3.62911943681219, + "tokens_seen": 1060004864 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003428385155466399, + "loss": 2.8443, + "theoretical_loss": 3.629103594557838, + "tokens_seen": 1060054016 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003428284854563691, + "loss": 2.997, + "theoretical_loss": 3.629082473014574, + "tokens_seen": 1060119552 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003428184553660983, + "loss": 2.8944, + "theoretical_loss": 3.6290613531425686, + "tokens_seen": 1060185088 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003428084252758275, + "loss": 2.7657, + "theoretical_loss": 3.6290402349415856, + "tokens_seen": 1060250624 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034279839518555665, + "loss": 3.0262, + "theoretical_loss": 3.6290191184113896, + "tokens_seen": 1060316160 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003427883650952859, + "loss": 2.9352, + "theoretical_loss": 3.6289980035517453, + "tokens_seen": 1060381696 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034277833500501507, + "loss": 2.8501, + "theoretical_loss": 3.6289768903624173, + "tokens_seen": 1060447232 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034276830491474425, + "loss": 2.804, + "theoretical_loss": 3.62895577884317, + "tokens_seen": 1060512768 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034275827482447343, + "loss": 2.9368, + "theoretical_loss": 3.628934668993768, + "tokens_seen": 1060578304 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003427482447342026, + "loss": 2.9125, + "theoretical_loss": 3.628913560813977, + "tokens_seen": 1060643840 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003427382146439318, + "loss": 2.8972, + "theoretical_loss": 3.6288924543035606, + "tokens_seen": 1060709376 + }, + { + "epoch": 3.01, + "learning_rate": 0.000342728184553661, + "loss": 2.8532, + "theoretical_loss": 3.628871349462284, + "tokens_seen": 1060774912 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034271815446339015, + "loss": 2.7428, + "theoretical_loss": 3.628850246289913, + "tokens_seen": 1060840448 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003427081243731194, + "loss": 2.8402, + "theoretical_loss": 3.628829144786211, + "tokens_seen": 1060905984 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003426980942828485, + "loss": 2.956, + "theoretical_loss": 3.6288080449509446, + "tokens_seen": 1060971520 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034268806419257775, + "loss": 2.8772, + "theoretical_loss": 3.628786946783878, + "tokens_seen": 1061037056 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034267803410230693, + "loss": 2.9985, + "theoretical_loss": 3.6287658502847755, + "tokens_seen": 1061102592 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003426680040120361, + "loss": 2.8272, + "theoretical_loss": 3.6287447554534036, + "tokens_seen": 1061168128 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003426579739217653, + "loss": 3.0282, + "theoretical_loss": 3.628723662289527, + "tokens_seen": 1061233664 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034264794383149453, + "loss": 2.8289, + "theoretical_loss": 3.6287025707929104, + "tokens_seen": 1061299200 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034263791374122366, + "loss": 2.8679, + "theoretical_loss": 3.6286814809633197, + "tokens_seen": 1061364736 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003426278836509529, + "loss": 2.8907, + "theoretical_loss": 3.62866039280052, + "tokens_seen": 1061430272 + }, + { + "epoch": 3.01, + "learning_rate": 0.000342617853560682, + "loss": 2.8834, + "theoretical_loss": 3.628639306304277, + "tokens_seen": 1061495808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034260782347041125, + "loss": 2.8905, + "theoretical_loss": 3.628618221474355, + "tokens_seen": 1061561344 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034259779338014043, + "loss": 2.9021, + "theoretical_loss": 3.6285971383105213, + "tokens_seen": 1061626880 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1704784, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.762840747833252, + "objective/train/theoretical_loss": 3.6285918677798614, + "objective/train/tokens_used": 1082103264, + "theoretical_loss": 3.6285918677798614, + "tokens_seen": 1061643264 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003425877632898696, + "loss": 2.7432, + "theoretical_loss": 3.62857605681254, + "tokens_seen": 1061692416 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003425777331995988, + "loss": 2.8874, + "theoretical_loss": 3.628554976980177, + "tokens_seen": 1061757952 + }, + { + "epoch": 3.01, + "learning_rate": 0.000342567703109328, + "loss": 2.9253, + "theoretical_loss": 3.6285338988131977, + "tokens_seen": 1061823488 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034255767301905716, + "loss": 2.9495, + "theoretical_loss": 3.628512822311368, + "tokens_seen": 1061889024 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003425476429287864, + "loss": 2.7098, + "theoretical_loss": 3.6284917474744542, + "tokens_seen": 1061954560 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003425376128385155, + "loss": 2.8603, + "theoretical_loss": 3.6284706743022213, + "tokens_seen": 1062020096 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034252758274824476, + "loss": 2.9358, + "theoretical_loss": 3.6284496027944346, + "tokens_seen": 1062085632 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003425175526579739, + "loss": 2.6202, + "theoretical_loss": 3.628428532950861, + "tokens_seen": 1062151168 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003425075225677031, + "loss": 2.9186, + "theoretical_loss": 3.628407464771266, + "tokens_seen": 1062216704 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003424974924774323, + "loss": 2.9001, + "theoretical_loss": 3.6283863982554156, + "tokens_seen": 1062282240 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003424874623871615, + "loss": 2.8982, + "theoretical_loss": 3.6283653334030754, + "tokens_seen": 1062347776 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034247743229689066, + "loss": 2.8318, + "theoretical_loss": 3.6283442702140123, + "tokens_seen": 1062413312 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003424674022066199, + "loss": 2.9928, + "theoretical_loss": 3.6283232086879913, + "tokens_seen": 1062478848 + }, + { + "epoch": 3.01, + "learning_rate": 0.000342457372116349, + "loss": 2.9735, + "theoretical_loss": 3.6283021488247793, + "tokens_seen": 1062544384 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034244734202607826, + "loss": 2.8813, + "theoretical_loss": 3.6282810906241423, + "tokens_seen": 1062609920 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003424373119358074, + "loss": 2.9404, + "theoretical_loss": 3.628260034085846, + "tokens_seen": 1062675456 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003424272818455366, + "loss": 2.8857, + "theoretical_loss": 3.628238979209658, + "tokens_seen": 1062740992 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003424172517552658, + "loss": 3.0072, + "theoretical_loss": 3.628217925995343, + "tokens_seen": 1062806528 + }, + { + "epoch": 3.01, + "learning_rate": 0.000342407221664995, + "loss": 2.801, + "theoretical_loss": 3.6281968744426676, + "tokens_seen": 1062872064 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034239719157472416, + "loss": 2.9335, + "theoretical_loss": 3.6281758245513993, + "tokens_seen": 1062937600 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034238716148445335, + "loss": 2.9692, + "theoretical_loss": 3.628154776321304, + "tokens_seen": 1063003136 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003423771313941825, + "loss": 2.9858, + "theoretical_loss": 3.628133729752148, + "tokens_seen": 1063068672 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034236710130391176, + "loss": 2.8939, + "theoretical_loss": 3.628112684843698, + "tokens_seen": 1063134208 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003423570712136409, + "loss": 2.9392, + "theoretical_loss": 3.628091641595721, + "tokens_seen": 1063199744 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003423470411233701, + "loss": 2.9314, + "theoretical_loss": 3.628070600007983, + "tokens_seen": 1063265280 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1706210, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.132633924484253, + "objective/train/theoretical_loss": 3.6280653398704334, + "objective/train/tokens_used": 1083741664, + "theoretical_loss": 3.6280653398704334, + "tokens_seen": 1063281664 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003423370110330993, + "loss": 3.0321, + "theoretical_loss": 3.6280495600802505, + "tokens_seen": 1063330816 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003423269809428285, + "loss": 2.6839, + "theoretical_loss": 3.628028521812291, + "tokens_seen": 1063396352 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003423169508525577, + "loss": 2.9013, + "theoretical_loss": 3.628007485203871, + "tokens_seen": 1063461888 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034230692076228685, + "loss": 2.9535, + "theoretical_loss": 3.627986450254758, + "tokens_seen": 1063527424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003422968906720161, + "loss": 2.8495, + "theoretical_loss": 3.6279654169647175, + "tokens_seen": 1063592960 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034228686058174527, + "loss": 2.9583, + "theoretical_loss": 3.6279443853335165, + "tokens_seen": 1063658496 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034227683049147445, + "loss": 2.9092, + "theoretical_loss": 3.627923355360924, + "tokens_seen": 1063724032 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034226680040120363, + "loss": 3.0731, + "theoretical_loss": 3.6279023270467046, + "tokens_seen": 1063789568 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003422567703109328, + "loss": 2.907, + "theoretical_loss": 3.6278813003906265, + "tokens_seen": 1063855104 + }, + { + "epoch": 3.01, + "learning_rate": 0.000342246740220662, + "loss": 2.9993, + "theoretical_loss": 3.627860275392457, + "tokens_seen": 1063920640 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003422367101303912, + "loss": 2.9092, + "theoretical_loss": 3.627839252051963, + "tokens_seen": 1063986176 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034222668004012035, + "loss": 2.8842, + "theoretical_loss": 3.6278182303689115, + "tokens_seen": 1064051712 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003422166499498496, + "loss": 2.7655, + "theoretical_loss": 3.62779721034307, + "tokens_seen": 1064117248 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003422066198595787, + "loss": 2.8777, + "theoretical_loss": 3.6277761919742053, + "tokens_seen": 1064182784 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034219658976930795, + "loss": 2.8695, + "theoretical_loss": 3.6277551752620862, + "tokens_seen": 1064248320 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034218655967903713, + "loss": 2.8012, + "theoretical_loss": 3.6277341602064785, + "tokens_seen": 1064313856 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003421765295887663, + "loss": 2.9365, + "theoretical_loss": 3.6277131468071504, + "tokens_seen": 1064379392 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003421664994984955, + "loss": 2.9012, + "theoretical_loss": 3.627692135063869, + "tokens_seen": 1064444928 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034215646940822473, + "loss": 2.9247, + "theoretical_loss": 3.627671124976402, + "tokens_seen": 1064510464 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034214643931795386, + "loss": 2.8598, + "theoretical_loss": 3.627650116544517, + "tokens_seen": 1064576000 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003421364092276831, + "loss": 3.0275, + "theoretical_loss": 3.6276291097679825, + "tokens_seen": 1064641536 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003421263791374122, + "loss": 2.7846, + "theoretical_loss": 3.6276081046465647, + "tokens_seen": 1064707072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034211634904714145, + "loss": 3.0064, + "theoretical_loss": 3.6275871011800325, + "tokens_seen": 1064772608 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034210631895687063, + "loss": 2.9108, + "theoretical_loss": 3.6275660993681527, + "tokens_seen": 1064838144 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003420962888665998, + "loss": 2.9827, + "theoretical_loss": 3.6275450992106935, + "tokens_seen": 1064903680 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1708934, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.649383306503296, + "objective/train/theoretical_loss": 3.6275398494298052, + "objective/train/tokens_used": 1085380064, + "theoretical_loss": 3.6275398494298052, + "tokens_seen": 1064920064 + }, + { + "epoch": 3.01, + "learning_rate": 0.000342086258776329, + "loss": 2.8199, + "theoretical_loss": 3.6275241007074235, + "tokens_seen": 1064969216 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003420762286860582, + "loss": 3.0321, + "theoretical_loss": 3.6275031038581096, + "tokens_seen": 1065034752 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034206619859578736, + "loss": 2.6736, + "theoretical_loss": 3.6274821086625204, + "tokens_seen": 1065100288 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003420561685055166, + "loss": 2.8813, + "theoretical_loss": 3.6274611151204237, + "tokens_seen": 1065165824 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003420461384152457, + "loss": 2.8137, + "theoretical_loss": 3.6274401232315867, + "tokens_seen": 1065231360 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034203610832497496, + "loss": 2.9296, + "theoretical_loss": 3.6274191329957794, + "tokens_seen": 1065296896 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003420260782347041, + "loss": 2.9444, + "theoretical_loss": 3.6273981444127683, + "tokens_seen": 1065362432 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003420160481444333, + "loss": 2.9589, + "theoretical_loss": 3.6273771574823224, + "tokens_seen": 1065427968 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003420060180541625, + "loss": 2.9517, + "theoretical_loss": 3.62735617220421, + "tokens_seen": 1065493504 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003419959879638917, + "loss": 2.8862, + "theoretical_loss": 3.6273351885781984, + "tokens_seen": 1065559040 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034198595787362086, + "loss": 3.0142, + "theoretical_loss": 3.6273142066040567, + "tokens_seen": 1065624576 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003419759277833501, + "loss": 2.9476, + "theoretical_loss": 3.627293226281554, + "tokens_seen": 1065690112 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003419658976930792, + "loss": 3.0601, + "theoretical_loss": 3.627272247610457, + "tokens_seen": 1065755648 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034195586760280846, + "loss": 3.0061, + "theoretical_loss": 3.627251270590536, + "tokens_seen": 1065821184 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003419458375125376, + "loss": 2.8949, + "theoretical_loss": 3.6272302952215583, + "tokens_seen": 1065886720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003419358074222668, + "loss": 2.991, + "theoretical_loss": 3.627209321503293, + "tokens_seen": 1065952256 + }, + { + "epoch": 3.01, + "learning_rate": 0.000341925777331996, + "loss": 2.9017, + "theoretical_loss": 3.627188349435508, + "tokens_seen": 1066017792 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003419157472417252, + "loss": 2.845, + "theoretical_loss": 3.6271673790179726, + "tokens_seen": 1066083328 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034190571715145436, + "loss": 2.8539, + "theoretical_loss": 3.627146410250456, + "tokens_seen": 1066148864 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034189568706118355, + "loss": 2.844, + "theoretical_loss": 3.627125443132726, + "tokens_seen": 1066214400 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003418856569709127, + "loss": 2.9417, + "theoretical_loss": 3.627104477664552, + "tokens_seen": 1066279936 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034187562688064196, + "loss": 2.7813, + "theoretical_loss": 3.6270835138457023, + "tokens_seen": 1066345472 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003418655967903711, + "loss": 3.0834, + "theoretical_loss": 3.6270625516759463, + "tokens_seen": 1066411008 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003418555667001003, + "loss": 2.8759, + "theoretical_loss": 3.627041591155053, + "tokens_seen": 1066476544 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034184553660982945, + "loss": 2.9347, + "theoretical_loss": 3.6270206322827905, + "tokens_seen": 1066542080 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1711457, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0663483142852783, + "objective/train/theoretical_loss": 3.6270153928222966, + "objective/train/tokens_used": 1087018464, + "theoretical_loss": 3.6270153928222966, + "tokens_seen": 1066558464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003418355065195587, + "loss": 2.98, + "theoretical_loss": 3.6269996750589293, + "tokens_seen": 1066607616 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034182547642928787, + "loss": 2.928, + "theoretical_loss": 3.6269787194832377, + "tokens_seen": 1066673152 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034181544633901705, + "loss": 2.911, + "theoretical_loss": 3.626957765555484, + "tokens_seen": 1066738688 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034180541624874623, + "loss": 2.8361, + "theoretical_loss": 3.6269368132754387, + "tokens_seen": 1066804224 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034179538615847547, + "loss": 3.0226, + "theoretical_loss": 3.6269158626428704, + "tokens_seen": 1066869760 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003417853560682046, + "loss": 2.8841, + "theoretical_loss": 3.626894913657549, + "tokens_seen": 1066935296 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034177532597793383, + "loss": 2.9981, + "theoretical_loss": 3.626873966319242, + "tokens_seen": 1067000832 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034176529588766295, + "loss": 2.8459, + "theoretical_loss": 3.6268530206277214, + "tokens_seen": 1067066368 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003417552657973922, + "loss": 2.8025, + "theoretical_loss": 3.6268320765827546, + "tokens_seen": 1067131904 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034174523570712137, + "loss": 2.9549, + "theoretical_loss": 3.626811134184112, + "tokens_seen": 1067197440 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034173520561685055, + "loss": 2.9561, + "theoretical_loss": 3.6267901934315625, + "tokens_seen": 1067262976 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034172517552657973, + "loss": 2.9927, + "theoretical_loss": 3.626769254324876, + "tokens_seen": 1067328512 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003417151454363089, + "loss": 2.9277, + "theoretical_loss": 3.6267483168638224, + "tokens_seen": 1067394048 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003417051153460381, + "loss": 2.8563, + "theoretical_loss": 3.6267273810481706, + "tokens_seen": 1067459584 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034169508525576733, + "loss": 2.9617, + "theoretical_loss": 3.6267064468776913, + "tokens_seen": 1067525120 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034168505516549646, + "loss": 2.9511, + "theoretical_loss": 3.626685514352153, + "tokens_seen": 1067590656 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003416750250752257, + "loss": 2.9037, + "theoretical_loss": 3.6266645834713263, + "tokens_seen": 1067656192 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003416649949849548, + "loss": 2.9741, + "theoretical_loss": 3.626643654234981, + "tokens_seen": 1067721728 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034165496489468406, + "loss": 2.9036, + "theoretical_loss": 3.6266227266428865, + "tokens_seen": 1067787264 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034164493480441324, + "loss": 2.9121, + "theoretical_loss": 3.6266018006948135, + "tokens_seen": 1067852800 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003416349047141424, + "loss": 2.8495, + "theoretical_loss": 3.6265808763905314, + "tokens_seen": 1067918336 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003416248746238716, + "loss": 2.4696, + "theoretical_loss": 3.62655995372981, + "tokens_seen": 1067983872 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034161484453360083, + "loss": 2.9291, + "theoretical_loss": 3.6265390327124196, + "tokens_seen": 1068049408 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034160481444332996, + "loss": 2.8549, + "theoretical_loss": 3.626518113338131, + "tokens_seen": 1068114944 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003415947843530592, + "loss": 2.8996, + "theoretical_loss": 3.6264971956067127, + "tokens_seen": 1068180480 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1714336, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8785064220428467, + "objective/train/theoretical_loss": 3.62649196643053, + "objective/train/tokens_used": 1088656864, + "theoretical_loss": 3.62649196643053, + "tokens_seen": 1068196864 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003415847542627884, + "loss": 2.7264, + "theoretical_loss": 3.626476279517936, + "tokens_seen": 1068246016 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034157472417251756, + "loss": 2.7443, + "theoretical_loss": 3.626455365071572, + "tokens_seen": 1068311552 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003415646940822468, + "loss": 2.8266, + "theoretical_loss": 3.6264344522673895, + "tokens_seen": 1068377088 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003415546639919759, + "loss": 2.79, + "theoretical_loss": 3.6264135411051592, + "tokens_seen": 1068442624 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034154463390170516, + "loss": 2.8318, + "theoretical_loss": 3.626392631584652, + "tokens_seen": 1068508160 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003415346038114343, + "loss": 3.1383, + "theoretical_loss": 3.626371723705638, + "tokens_seen": 1068573696 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003415245737211635, + "loss": 2.9074, + "theoretical_loss": 3.6263508174678876, + "tokens_seen": 1068639232 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003415145436308927, + "loss": 2.9061, + "theoretical_loss": 3.626329912871171, + "tokens_seen": 1068704768 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003415045135406219, + "loss": 2.857, + "theoretical_loss": 3.6263090099152597, + "tokens_seen": 1068770304 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034149448345035106, + "loss": 2.8342, + "theoretical_loss": 3.6262881085999235, + "tokens_seen": 1068835840 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003414844533600803, + "loss": 2.8056, + "theoretical_loss": 3.626267208924933, + "tokens_seen": 1068901376 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003414744232698094, + "loss": 2.7719, + "theoretical_loss": 3.6262463108900596, + "tokens_seen": 1068966912 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034146439317953866, + "loss": 2.9219, + "theoretical_loss": 3.6262254144950736, + "tokens_seen": 1069032448 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003414543630892678, + "loss": 2.6488, + "theoretical_loss": 3.626204519739746, + "tokens_seen": 1069097984 + }, + { + "epoch": 3.01, + "learning_rate": 0.000341444332998997, + "loss": 2.9542, + "theoretical_loss": 3.626183626623847, + "tokens_seen": 1069163520 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003414343029087262, + "loss": 2.9146, + "theoretical_loss": 3.6261627351471484, + "tokens_seen": 1069229056 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003414242728184554, + "loss": 2.8602, + "theoretical_loss": 3.6261418453094207, + "tokens_seen": 1069294592 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034141424272818456, + "loss": 2.7292, + "theoretical_loss": 3.6261209571104347, + "tokens_seen": 1069360128 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034140421263791375, + "loss": 2.7856, + "theoretical_loss": 3.6261000705499624, + "tokens_seen": 1069425664 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034139418254764293, + "loss": 2.9475, + "theoretical_loss": 3.6260791856277734, + "tokens_seen": 1069491200 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034138415245737216, + "loss": 2.8518, + "theoretical_loss": 3.6260583023436395, + "tokens_seen": 1069556736 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003413741223671013, + "loss": 2.8341, + "theoretical_loss": 3.6260374206973323, + "tokens_seen": 1069622272 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003413640922768305, + "loss": 2.9939, + "theoretical_loss": 3.6260165406886222, + "tokens_seen": 1069687808 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034135406218655965, + "loss": 2.8169, + "theoretical_loss": 3.6259956623172815, + "tokens_seen": 1069753344 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003413440320962889, + "loss": 2.8531, + "theoretical_loss": 3.6259747855830797, + "tokens_seen": 1069818880 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1717358, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7589900493621826, + "objective/train/theoretical_loss": 3.6259695666553062, + "objective/train/tokens_used": 1090295264, + "theoretical_loss": 3.6259695666553062, + "tokens_seen": 1069835264 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034133400200601807, + "loss": 2.8437, + "theoretical_loss": 3.6259539104857907, + "tokens_seen": 1069884416 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034132397191574725, + "loss": 3.0006, + "theoretical_loss": 3.6259330370251837, + "tokens_seen": 1069949952 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034131394182547643, + "loss": 2.7869, + "theoretical_loss": 3.625912165201031, + "tokens_seen": 1070015488 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034130391173520567, + "loss": 2.8567, + "theoretical_loss": 3.625891295013104, + "tokens_seen": 1070081024 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003412938816449348, + "loss": 2.8289, + "theoretical_loss": 3.6258704264611747, + "tokens_seen": 1070146560 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034128385155466403, + "loss": 2.7604, + "theoretical_loss": 3.6258495595450135, + "tokens_seen": 1070212096 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034127382146439315, + "loss": 2.7769, + "theoretical_loss": 3.625828694264394, + "tokens_seen": 1070277632 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003412637913741224, + "loss": 2.8717, + "theoretical_loss": 3.6258078306190855, + "tokens_seen": 1070343168 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034125376128385157, + "loss": 2.8746, + "theoretical_loss": 3.6257869686088613, + "tokens_seen": 1070408704 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034124373119358075, + "loss": 2.9442, + "theoretical_loss": 3.6257661082334924, + "tokens_seen": 1070474240 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034123370110330993, + "loss": 2.6901, + "theoretical_loss": 3.625745249492751, + "tokens_seen": 1070539776 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003412236710130391, + "loss": 2.8273, + "theoretical_loss": 3.6257243923864095, + "tokens_seen": 1070605312 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003412136409227683, + "loss": 2.8409, + "theoretical_loss": 3.625703536914239, + "tokens_seen": 1070670848 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034120361083249753, + "loss": 2.8691, + "theoretical_loss": 3.625682683076011, + "tokens_seen": 1070736384 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034119358074222666, + "loss": 2.9419, + "theoretical_loss": 3.6256618308714987, + "tokens_seen": 1070801920 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003411835506519559, + "loss": 2.9173, + "theoretical_loss": 3.6256409803004734, + "tokens_seen": 1070867456 + }, + { + "epoch": 3.01, + "learning_rate": 0.000341173520561685, + "loss": 3.0083, + "theoretical_loss": 3.625620131362707, + "tokens_seen": 1070932992 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034116349047141426, + "loss": 2.811, + "theoretical_loss": 3.625599284057973, + "tokens_seen": 1070998528 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034115346038114344, + "loss": 2.8136, + "theoretical_loss": 3.625578438386041, + "tokens_seen": 1071064064 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003411434302908726, + "loss": 2.9647, + "theoretical_loss": 3.6255575943466862, + "tokens_seen": 1071129600 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003411334002006018, + "loss": 2.9011, + "theoretical_loss": 3.625536751939679, + "tokens_seen": 1071195136 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034112337011033103, + "loss": 2.9536, + "theoretical_loss": 3.6255159111647917, + "tokens_seen": 1071260672 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034111334002006016, + "loss": 2.9915, + "theoretical_loss": 3.6254950720217973, + "tokens_seen": 1071326208 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003411033099297894, + "loss": 2.8716, + "theoretical_loss": 3.625474234510468, + "tokens_seen": 1071391744 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003410932798395185, + "loss": 3.0163, + "theoretical_loss": 3.625453398630576, + "tokens_seen": 1071457280 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1720270, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0411574840545654, + "objective/train/theoretical_loss": 3.6254481899154887, + "objective/train/tokens_used": 1091933664, + "theoretical_loss": 3.6254481899154887, + "tokens_seen": 1071473664 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034108324974924776, + "loss": 2.9493, + "theoretical_loss": 3.6254325643818945, + "tokens_seen": 1071522816 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034107321965897694, + "loss": 2.8055, + "theoretical_loss": 3.6254117317641956, + "tokens_seen": 1071588352 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003410631895687061, + "loss": 3.0348, + "theoretical_loss": 3.6253909007772513, + "tokens_seen": 1071653888 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003410531594784353, + "loss": 3.018, + "theoretical_loss": 3.6253700714208357, + "tokens_seen": 1071719424 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003410431293881645, + "loss": 3.0045, + "theoretical_loss": 3.6253492436947194, + "tokens_seen": 1071784960 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034103309929789366, + "loss": 2.8991, + "theoretical_loss": 3.625328417598677, + "tokens_seen": 1071850496 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003410230692076229, + "loss": 2.8103, + "theoretical_loss": 3.6253075931324807, + "tokens_seen": 1071916032 + }, + { + "epoch": 3.01, + "learning_rate": 0.000341013039117352, + "loss": 2.8913, + "theoretical_loss": 3.6252867702959026, + "tokens_seen": 1071981568 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034100300902708126, + "loss": 2.9233, + "theoretical_loss": 3.6252659490887167, + "tokens_seen": 1072047104 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034099297893681044, + "loss": 2.7634, + "theoretical_loss": 3.625245129510695, + "tokens_seen": 1072112640 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003409829488465396, + "loss": 2.8244, + "theoretical_loss": 3.6252243115616114, + "tokens_seen": 1072178176 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003409729187562688, + "loss": 2.7387, + "theoretical_loss": 3.6252034952412373, + "tokens_seen": 1072243712 + }, + { + "epoch": 3.01, + "learning_rate": 0.000340962888665998, + "loss": 3.0682, + "theoretical_loss": 3.6251826805493472, + "tokens_seen": 1072309248 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034095285857572717, + "loss": 2.9728, + "theoretical_loss": 3.625161867485714, + "tokens_seen": 1072374784 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003409428284854564, + "loss": 2.9435, + "theoretical_loss": 3.6251410560501105, + "tokens_seen": 1072440320 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034093279839518553, + "loss": 2.8805, + "theoretical_loss": 3.625120246242309, + "tokens_seen": 1072505856 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034092276830491477, + "loss": 2.8744, + "theoretical_loss": 3.625099438062085, + "tokens_seen": 1072571392 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003409127382146439, + "loss": 2.926, + "theoretical_loss": 3.625078631509209, + "tokens_seen": 1072636928 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034090270812437313, + "loss": 2.8383, + "theoretical_loss": 3.6250578265834568, + "tokens_seen": 1072702464 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003408926780341023, + "loss": 3.0638, + "theoretical_loss": 3.6250370232846, + "tokens_seen": 1072768000 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003408826479438315, + "loss": 2.859, + "theoretical_loss": 3.6250162216124133, + "tokens_seen": 1072833536 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034087261785356067, + "loss": 2.7747, + "theoretical_loss": 3.624995421566669, + "tokens_seen": 1072899072 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034086258776328985, + "loss": 2.7099, + "theoretical_loss": 3.624974623147141, + "tokens_seen": 1072964608 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034085255767301903, + "loss": 2.9371, + "theoretical_loss": 3.624953826353604, + "tokens_seen": 1073030144 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034084252758274827, + "loss": 2.861, + "theoretical_loss": 3.6249330311858294, + "tokens_seen": 1073095680 + }, + { + "epoch": 3.01, + "objective/train/docs_used": 1723064, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6076018810272217, + "objective/train/theoretical_loss": 3.624927832647885, + "objective/train/tokens_used": 1093572064, + "theoretical_loss": 3.624927832647885, + "tokens_seen": 1073112064 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034083249749247745, + "loss": 2.7748, + "theoretical_loss": 3.6249122376435925, + "tokens_seen": 1073161216 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034082246740220663, + "loss": 2.8378, + "theoretical_loss": 3.624891445726666, + "tokens_seen": 1073226752 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034081243731193587, + "loss": 2.8277, + "theoretical_loss": 3.6248706554348247, + "tokens_seen": 1073292288 + }, + { + "epoch": 3.01, + "learning_rate": 0.000340802407221665, + "loss": 2.97, + "theoretical_loss": 3.6248498667678417, + "tokens_seen": 1073357824 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034079237713139423, + "loss": 2.9241, + "theoretical_loss": 3.6248290797254903, + "tokens_seen": 1073423360 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034078234704112335, + "loss": 2.9701, + "theoretical_loss": 3.6248082943075457, + "tokens_seen": 1073488896 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003407723169508526, + "loss": 2.9489, + "theoretical_loss": 3.6247875105137806, + "tokens_seen": 1073554432 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034076228686058177, + "loss": 2.8503, + "theoretical_loss": 3.6247667283439693, + "tokens_seen": 1073619968 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034075225677031095, + "loss": 2.8932, + "theoretical_loss": 3.624745947797886, + "tokens_seen": 1073685504 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034074222668004013, + "loss": 2.9418, + "theoretical_loss": 3.6247251688753046, + "tokens_seen": 1073751040 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003407321965897693, + "loss": 2.932, + "theoretical_loss": 3.6247043915759987, + "tokens_seen": 1073816576 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003407221664994985, + "loss": 2.9952, + "theoretical_loss": 3.6246836158997437, + "tokens_seen": 1073882112 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034071213640922773, + "loss": 2.893, + "theoretical_loss": 3.6246628418463125, + "tokens_seen": 1073947648 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034070210631895686, + "loss": 2.8805, + "theoretical_loss": 3.62464206941548, + "tokens_seen": 1074013184 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003406920762286861, + "loss": 3.0147, + "theoretical_loss": 3.6246212986070203, + "tokens_seen": 1074078720 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003406820461384152, + "loss": 2.9116, + "theoretical_loss": 3.6246005294207073, + "tokens_seen": 1074144256 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034067201604814446, + "loss": 2.9532, + "theoretical_loss": 3.6245797618563165, + "tokens_seen": 1074209792 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034066198595787364, + "loss": 2.8185, + "theoretical_loss": 3.624558995913621, + "tokens_seen": 1074275328 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003406519558676028, + "loss": 2.8448, + "theoretical_loss": 3.6245382315923957, + "tokens_seen": 1074340864 + }, + { + "epoch": 3.01, + "learning_rate": 0.000340641925777332, + "loss": 2.9984, + "theoretical_loss": 3.6245174688924156, + "tokens_seen": 1074406400 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034063189568706123, + "loss": 2.8589, + "theoretical_loss": 3.6244967078134542, + "tokens_seen": 1074471936 + }, + { + "epoch": 3.01, + "learning_rate": 0.00034062186559679036, + "loss": 2.8786, + "theoretical_loss": 3.624475948355287, + "tokens_seen": 1074537472 + }, + { + "epoch": 3.01, + "learning_rate": 0.0003406118355065196, + "loss": 2.9875, + "theoretical_loss": 3.6244551905176885, + "tokens_seen": 1074603008 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003406018054162487, + "loss": 2.8655, + "theoretical_loss": 3.624434434300433, + "tokens_seen": 1074668544 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034059177532597796, + "loss": 2.7745, + "theoretical_loss": 3.624413679703295, + "tokens_seen": 1074734080 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1725354, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9782602787017822, + "objective/train/theoretical_loss": 3.624408491307128, + "objective/train/tokens_used": 1095210464, + "theoretical_loss": 3.624408491307128, + "tokens_seen": 1074750464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034058174523570714, + "loss": 3.0207, + "theoretical_loss": 3.6243929267260504, + "tokens_seen": 1074799616 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003405717151454363, + "loss": 2.8564, + "theoretical_loss": 3.624372175368473, + "tokens_seen": 1074865152 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003405616850551655, + "loss": 2.8749, + "theoretical_loss": 3.624351425630338, + "tokens_seen": 1074930688 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003405516549648947, + "loss": 3.0042, + "theoretical_loss": 3.62433067751142, + "tokens_seen": 1074996224 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034054162487462386, + "loss": 2.8527, + "theoretical_loss": 3.624309931011495, + "tokens_seen": 1075061760 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003405315947843531, + "loss": 3.0248, + "theoretical_loss": 3.624289186130336, + "tokens_seen": 1075127296 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003405215646940822, + "loss": 2.9298, + "theoretical_loss": 3.624268442867719, + "tokens_seen": 1075192832 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034051153460381146, + "loss": 2.9663, + "theoretical_loss": 3.62424770122342, + "tokens_seen": 1075258368 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034050150451354064, + "loss": 2.9901, + "theoretical_loss": 3.6242269611972135, + "tokens_seen": 1075323904 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003404914744232698, + "loss": 2.9319, + "theoretical_loss": 3.6242062227888745, + "tokens_seen": 1075389440 + }, + { + "epoch": 3.02, + "learning_rate": 0.000340481444332999, + "loss": 2.8986, + "theoretical_loss": 3.624185485998178, + "tokens_seen": 1075454976 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003404714142427282, + "loss": 2.8413, + "theoretical_loss": 3.6241647508248995, + "tokens_seen": 1075520512 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034046138415245737, + "loss": 2.8739, + "theoretical_loss": 3.624144017268814, + "tokens_seen": 1075586048 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003404513540621866, + "loss": 2.9035, + "theoretical_loss": 3.6241232853296976, + "tokens_seen": 1075651584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034044132397191573, + "loss": 2.8164, + "theoretical_loss": 3.624102555007325, + "tokens_seen": 1075717120 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034043129388164497, + "loss": 2.687, + "theoretical_loss": 3.624081826301472, + "tokens_seen": 1075782656 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003404212637913741, + "loss": 2.8888, + "theoretical_loss": 3.6240610992119144, + "tokens_seen": 1075848192 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034041123370110333, + "loss": 2.86, + "theoretical_loss": 3.624040373738427, + "tokens_seen": 1075913728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003404012036108325, + "loss": 2.7815, + "theoretical_loss": 3.6240196498807853, + "tokens_seen": 1075979264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003403911735205617, + "loss": 2.8723, + "theoretical_loss": 3.6239989276387656, + "tokens_seen": 1076044800 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034038114343029087, + "loss": 2.9151, + "theoretical_loss": 3.623978207012143, + "tokens_seen": 1076110336 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034037111334002005, + "loss": 2.9885, + "theoretical_loss": 3.6239574880006935, + "tokens_seen": 1076175872 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034036108324974923, + "loss": 2.7988, + "theoretical_loss": 3.623936770604193, + "tokens_seen": 1076241408 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034035105315947847, + "loss": 2.734, + "theoretical_loss": 3.623916054822417, + "tokens_seen": 1076306944 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003403410230692076, + "loss": 2.8872, + "theoretical_loss": 3.623895340655141, + "tokens_seen": 1076372480 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1728064, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.81864857673645, + "objective/train/theoretical_loss": 3.623890162365562, + "objective/train/tokens_used": 1096848864, + "theoretical_loss": 3.623890162365562, + "tokens_seen": 1076388864 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034033099297893683, + "loss": 2.7023, + "theoretical_loss": 3.6238746281021417, + "tokens_seen": 1076438016 + }, + { + "epoch": 3.02, + "learning_rate": 0.000340320962888666, + "loss": 2.9485, + "theoretical_loss": 3.6238539171631947, + "tokens_seen": 1076503552 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003403109327983952, + "loss": 2.9347, + "theoretical_loss": 3.623833207838076, + "tokens_seen": 1076569088 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003403009027081244, + "loss": 2.9753, + "theoretical_loss": 3.623812500126561, + "tokens_seen": 1076634624 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034029087261785356, + "loss": 2.7211, + "theoretical_loss": 3.6237917940284268, + "tokens_seen": 1076700160 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034028084252758274, + "loss": 2.9399, + "theoretical_loss": 3.6237710895434487, + "tokens_seen": 1076765696 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034027081243731197, + "loss": 3.0148, + "theoretical_loss": 3.623750386671403, + "tokens_seen": 1076831232 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003402607823470411, + "loss": 2.8833, + "theoretical_loss": 3.6237296854120666, + "tokens_seen": 1076896768 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034025075225677033, + "loss": 2.8123, + "theoretical_loss": 3.623708985765215, + "tokens_seen": 1076962304 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034024072216649946, + "loss": 2.8904, + "theoretical_loss": 3.6236882877306242, + "tokens_seen": 1077027840 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003402306920762287, + "loss": 2.7376, + "theoretical_loss": 3.623667591308071, + "tokens_seen": 1077093376 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003402206619859579, + "loss": 2.8588, + "theoretical_loss": 3.6236468964973323, + "tokens_seen": 1077158912 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034021063189568706, + "loss": 2.8688, + "theoretical_loss": 3.623626203298184, + "tokens_seen": 1077224448 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034020060180541624, + "loss": 3.022, + "theoretical_loss": 3.6236055117104025, + "tokens_seen": 1077289984 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003401905717151454, + "loss": 2.9503, + "theoretical_loss": 3.623584821733764, + "tokens_seen": 1077355520 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003401805416248746, + "loss": 2.8721, + "theoretical_loss": 3.6235641333680455, + "tokens_seen": 1077421056 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034017051153460384, + "loss": 2.8529, + "theoretical_loss": 3.6235434466130236, + "tokens_seen": 1077486592 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034016048144433296, + "loss": 2.8374, + "theoretical_loss": 3.6235227614684753, + "tokens_seen": 1077552128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003401504513540622, + "loss": 2.8859, + "theoretical_loss": 3.6235020779341762, + "tokens_seen": 1077617664 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003401404212637914, + "loss": 3.0228, + "theoretical_loss": 3.623481396009904, + "tokens_seen": 1077683200 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034013039117352056, + "loss": 2.8541, + "theoretical_loss": 3.623460715695435, + "tokens_seen": 1077748736 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034012036108324974, + "loss": 2.8509, + "theoretical_loss": 3.623440036990546, + "tokens_seen": 1077814272 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003401103309929789, + "loss": 2.9256, + "theoretical_loss": 3.6234193598950144, + "tokens_seen": 1077879808 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003401003009027081, + "loss": 2.9642, + "theoretical_loss": 3.623398684408617, + "tokens_seen": 1077945344 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034009027081243734, + "loss": 2.9549, + "theoretical_loss": 3.6233780105311304, + "tokens_seen": 1078010880 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1730764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.768656015396118, + "objective/train/theoretical_loss": 3.623372842313125, + "objective/train/tokens_used": 1098487264, + "theoretical_loss": 3.623372842313125, + "tokens_seen": 1078027264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003400802407221665, + "loss": 2.9313, + "theoretical_loss": 3.6233573382623314, + "tokens_seen": 1078076416 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003400702106318957, + "loss": 2.9216, + "theoretical_loss": 3.6233366676019974, + "tokens_seen": 1078141952 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003400601805416249, + "loss": 2.9009, + "theoretical_loss": 3.6233159985499057, + "tokens_seen": 1078207488 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034005015045135406, + "loss": 2.8978, + "theoretical_loss": 3.6232953311058327, + "tokens_seen": 1078273024 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003400401203610833, + "loss": 2.9957, + "theoretical_loss": 3.623274665269556, + "tokens_seen": 1078338560 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003400300902708124, + "loss": 2.8325, + "theoretical_loss": 3.6232540010408534, + "tokens_seen": 1078404096 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034002006018054166, + "loss": 2.8855, + "theoretical_loss": 3.6232333384195012, + "tokens_seen": 1078469632 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034001003009027084, + "loss": 2.8571, + "theoretical_loss": 3.6232126774052777, + "tokens_seen": 1078535168 + }, + { + "epoch": 3.02, + "learning_rate": 0.00034, + "loss": 2.8603, + "theoretical_loss": 3.623192017997959, + "tokens_seen": 1078600704 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003399899699097292, + "loss": 2.7963, + "theoretical_loss": 3.6231713601973237, + "tokens_seen": 1078666240 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003399799398194584, + "loss": 2.8385, + "theoretical_loss": 3.623150704003149, + "tokens_seen": 1078731776 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033996990972918757, + "loss": 3.0189, + "theoretical_loss": 3.6231300494152117, + "tokens_seen": 1078797312 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003399598796389168, + "loss": 2.9605, + "theoretical_loss": 3.6231093964332897, + "tokens_seen": 1078862848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033994984954864593, + "loss": 3.0011, + "theoretical_loss": 3.623088745057161, + "tokens_seen": 1078928384 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033993981945837517, + "loss": 2.8707, + "theoretical_loss": 3.6230680952866026, + "tokens_seen": 1078993920 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003399297893681043, + "loss": 3.0068, + "theoretical_loss": 3.623047447121392, + "tokens_seen": 1079059456 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033991975927783353, + "loss": 2.9455, + "theoretical_loss": 3.623026800561308, + "tokens_seen": 1079124992 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003399097291875627, + "loss": 2.9775, + "theoretical_loss": 3.6230061556061273, + "tokens_seen": 1079190528 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003398996990972919, + "loss": 2.9445, + "theoretical_loss": 3.622985512255628, + "tokens_seen": 1079256064 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033988966900702107, + "loss": 2.8548, + "theoretical_loss": 3.6229648705095885, + "tokens_seen": 1079321600 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033987963891675025, + "loss": 3.0662, + "theoretical_loss": 3.622944230367786, + "tokens_seen": 1079387136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033986960882647943, + "loss": 2.9544, + "theoretical_loss": 3.6229235918299985, + "tokens_seen": 1079452672 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033985957873620867, + "loss": 2.8426, + "theoretical_loss": 3.622902954896005, + "tokens_seen": 1079518208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003398495486459378, + "loss": 2.9125, + "theoretical_loss": 3.6228823195655813, + "tokens_seen": 1079583744 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033983951855566703, + "loss": 2.7711, + "theoretical_loss": 3.622861685838507, + "tokens_seen": 1079649280 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1732164, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6899046897888184, + "objective/train/theoretical_loss": 3.6228565276572358, + "objective/train/tokens_used": 1100125664, + "theoretical_loss": 3.6228565276572358, + "tokens_seen": 1079665664 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003398294884653962, + "loss": 2.8493, + "theoretical_loss": 3.6228410537145606, + "tokens_seen": 1079714816 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003398194583751254, + "loss": 2.8919, + "theoretical_loss": 3.6228204231935193, + "tokens_seen": 1079780352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003398094282848546, + "loss": 3.0383, + "theoretical_loss": 3.6227997942751617, + "tokens_seen": 1079845888 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033979939819458376, + "loss": 2.8575, + "theoretical_loss": 3.6227791669592664, + "tokens_seen": 1079911424 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033978936810431294, + "loss": 2.907, + "theoretical_loss": 3.622758541245611, + "tokens_seen": 1079976960 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033977933801404217, + "loss": 2.8341, + "theoretical_loss": 3.622737917133974, + "tokens_seen": 1080042496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003397693079237713, + "loss": 2.8693, + "theoretical_loss": 3.6227172946241337, + "tokens_seen": 1080108032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033975927783350053, + "loss": 2.8471, + "theoretical_loss": 3.6226966737158692, + "tokens_seen": 1080173568 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033974924774322966, + "loss": 2.9, + "theoretical_loss": 3.622676054408958, + "tokens_seen": 1080239104 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003397392176529589, + "loss": 2.8256, + "theoretical_loss": 3.6226554367031794, + "tokens_seen": 1080304640 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003397291875626881, + "loss": 2.9682, + "theoretical_loss": 3.622634820598311, + "tokens_seen": 1080370176 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033971915747241726, + "loss": 2.8138, + "theoretical_loss": 3.6226142060941324, + "tokens_seen": 1080435712 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033970912738214644, + "loss": 2.9136, + "theoretical_loss": 3.6225935931904223, + "tokens_seen": 1080501248 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003396990972918756, + "loss": 2.9358, + "theoretical_loss": 3.6225729818869583, + "tokens_seen": 1080566784 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003396890672016048, + "loss": 2.8976, + "theoretical_loss": 3.62255237218352, + "tokens_seen": 1080632320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033967903711133404, + "loss": 3.1288, + "theoretical_loss": 3.622531764079886, + "tokens_seen": 1080697856 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033966900702106316, + "loss": 2.7905, + "theoretical_loss": 3.6225111575758344, + "tokens_seen": 1080763392 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003396589769307924, + "loss": 2.8932, + "theoretical_loss": 3.6224905526711453, + "tokens_seen": 1080828928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003396489468405216, + "loss": 2.8834, + "theoretical_loss": 3.6224699493655965, + "tokens_seen": 1080894464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033963891675025076, + "loss": 2.9548, + "theoretical_loss": 3.622449347658968, + "tokens_seen": 1080960000 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033962888665997994, + "loss": 2.9506, + "theoretical_loss": 3.622428747551038, + "tokens_seen": 1081025536 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003396188565697091, + "loss": 2.8722, + "theoretical_loss": 3.6224081490415854, + "tokens_seen": 1081091072 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003396088264794383, + "loss": 2.939, + "theoretical_loss": 3.62238755213039, + "tokens_seen": 1081156608 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033959879638916754, + "loss": 2.9429, + "theoretical_loss": 3.6223669568172303, + "tokens_seen": 1081222144 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033958876629889667, + "loss": 2.9416, + "theoretical_loss": 3.6223463631018857, + "tokens_seen": 1081287680 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1734795, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8881170749664307, + "objective/train/theoretical_loss": 3.622341214922683, + "objective/train/tokens_used": 1101764064, + "theoretical_loss": 3.622341214922683, + "tokens_seen": 1081304064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003395787362086259, + "loss": 2.8474, + "theoretical_loss": 3.622325770984136, + "tokens_seen": 1081353216 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033956870611835503, + "loss": 2.7129, + "theoretical_loss": 3.6223051804637594, + "tokens_seen": 1081418752 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033955867602808426, + "loss": 2.8631, + "theoretical_loss": 3.6222845915405353, + "tokens_seen": 1081484288 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033954864593781345, + "loss": 2.8748, + "theoretical_loss": 3.6222640042142435, + "tokens_seen": 1081549824 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003395386158475426, + "loss": 2.7513, + "theoretical_loss": 3.622243418484664, + "tokens_seen": 1081615360 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003395285857572718, + "loss": 2.8971, + "theoretical_loss": 3.622222834351575, + "tokens_seen": 1081680896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033951855566700104, + "loss": 2.8565, + "theoretical_loss": 3.6222022518147563, + "tokens_seen": 1081746432 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033950852557673017, + "loss": 2.89, + "theoretical_loss": 3.622181670873988, + "tokens_seen": 1081811968 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003394984954864594, + "loss": 2.9183, + "theoretical_loss": 3.622161091529049, + "tokens_seen": 1081877504 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033948846539618853, + "loss": 2.871, + "theoretical_loss": 3.622140513779719, + "tokens_seen": 1081943040 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033947843530591777, + "loss": 2.8594, + "theoretical_loss": 3.6221199376257784, + "tokens_seen": 1082008576 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033946840521564695, + "loss": 2.8914, + "theoretical_loss": 3.6220993630670058, + "tokens_seen": 1082074112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033945837512537613, + "loss": 2.6506, + "theoretical_loss": 3.6220787901031812, + "tokens_seen": 1082139648 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003394483450351053, + "loss": 2.8597, + "theoretical_loss": 3.622058218734085, + "tokens_seen": 1082205184 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003394383149448345, + "loss": 2.7912, + "theoretical_loss": 3.6220376489594965, + "tokens_seen": 1082270720 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003394282848545637, + "loss": 2.9643, + "theoretical_loss": 3.6220170807791963, + "tokens_seen": 1082336256 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003394182547642929, + "loss": 2.9162, + "theoretical_loss": 3.621996514192963, + "tokens_seen": 1082401792 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033940822467402204, + "loss": 3.0903, + "theoretical_loss": 3.6219759492005776, + "tokens_seen": 1082467328 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033939819458375127, + "loss": 2.9427, + "theoretical_loss": 3.6219553858018196, + "tokens_seen": 1082532864 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003393881644934804, + "loss": 3.0213, + "theoretical_loss": 3.621934823996469, + "tokens_seen": 1082598400 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033937813440320963, + "loss": 2.9089, + "theoretical_loss": 3.6219142637843063, + "tokens_seen": 1082663936 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003393681043129388, + "loss": 2.8903, + "theoretical_loss": 3.621893705165111, + "tokens_seen": 1082729472 + }, + { + "epoch": 3.02, + "learning_rate": 0.000339358074222668, + "loss": 2.8193, + "theoretical_loss": 3.6218731481386643, + "tokens_seen": 1082795008 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003393480441323972, + "loss": 2.7475, + "theoretical_loss": 3.6218525927047454, + "tokens_seen": 1082860544 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003393380140421264, + "loss": 2.8706, + "theoretical_loss": 3.6218320388631353, + "tokens_seen": 1082926080 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1737651, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0547361373901367, + "objective/train/theoretical_loss": 3.621826900651505, + "objective/train/tokens_used": 1103402464, + "theoretical_loss": 3.621826900651505, + "tokens_seen": 1082942464 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003393279839518556, + "loss": 3.0518, + "theoretical_loss": 3.6218114866136135, + "tokens_seen": 1082991616 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003393179538615848, + "loss": 2.9204, + "theoretical_loss": 3.621790935955961, + "tokens_seen": 1083057152 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033930792377131396, + "loss": 2.7666, + "theoretical_loss": 3.621770386889958, + "tokens_seen": 1083122688 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033929789368104314, + "loss": 2.9248, + "theoretical_loss": 3.6217498394153846, + "tokens_seen": 1083188224 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033928786359077237, + "loss": 2.9551, + "theoretical_loss": 3.621729293532022, + "tokens_seen": 1083253760 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003392778335005015, + "loss": 2.7812, + "theoretical_loss": 3.62170874923965, + "tokens_seen": 1083319296 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033926780341023073, + "loss": 2.9405, + "theoretical_loss": 3.62168820653805, + "tokens_seen": 1083384832 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033925777331995986, + "loss": 2.9663, + "theoretical_loss": 3.621667665427002, + "tokens_seen": 1083450368 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003392477432296891, + "loss": 3.0001, + "theoretical_loss": 3.621647125906286, + "tokens_seen": 1083515904 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003392377131394183, + "loss": 2.9845, + "theoretical_loss": 3.621626587975684, + "tokens_seen": 1083581440 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033922768304914746, + "loss": 2.8395, + "theoretical_loss": 3.621606051634976, + "tokens_seen": 1083646976 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033921765295887664, + "loss": 2.9439, + "theoretical_loss": 3.6215855168839433, + "tokens_seen": 1083712512 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003392076228686058, + "loss": 2.835, + "theoretical_loss": 3.6215649837223656, + "tokens_seen": 1083778048 + }, + { + "epoch": 3.02, + "learning_rate": 0.000339197592778335, + "loss": 2.7576, + "theoretical_loss": 3.621544452150025, + "tokens_seen": 1083843584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033918756268806424, + "loss": 2.917, + "theoretical_loss": 3.621523922166702, + "tokens_seen": 1083909120 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033917753259779336, + "loss": 2.8862, + "theoretical_loss": 3.6215033937721772, + "tokens_seen": 1083974656 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003391675025075226, + "loss": 2.8759, + "theoretical_loss": 3.6214828669662325, + "tokens_seen": 1084040192 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003391574724172518, + "loss": 2.9627, + "theoretical_loss": 3.6214623417486482, + "tokens_seen": 1084105728 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033914744232698096, + "loss": 2.9884, + "theoretical_loss": 3.621441818119205, + "tokens_seen": 1084171264 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033913741223671014, + "loss": 2.818, + "theoretical_loss": 3.621421296077685, + "tokens_seen": 1084236800 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003391273821464393, + "loss": 2.7027, + "theoretical_loss": 3.6214007756238686, + "tokens_seen": 1084302336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003391173520561685, + "loss": 2.9309, + "theoretical_loss": 3.6213802567575377, + "tokens_seen": 1084367872 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033910732196589774, + "loss": 2.8113, + "theoretical_loss": 3.621359739478473, + "tokens_seen": 1084433408 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033909729187562687, + "loss": 2.9135, + "theoretical_loss": 3.621339223786456, + "tokens_seen": 1084498944 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003390872617853561, + "loss": 2.9364, + "theoretical_loss": 3.621318709681268, + "tokens_seen": 1084564480 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1740671, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.647988796234131, + "objective/train/theoretical_loss": 3.6213135814028874, + "objective/train/tokens_used": 1105040864, + "theoretical_loss": 3.6213135814028874, + "tokens_seen": 1084580864 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033907723169508523, + "loss": 2.7775, + "theoretical_loss": 3.6212981971626905, + "tokens_seen": 1084630016 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033906720160481446, + "loss": 2.9079, + "theoretical_loss": 3.621277686230505, + "tokens_seen": 1084695552 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033905717151454365, + "loss": 2.9046, + "theoretical_loss": 3.621257176884493, + "tokens_seen": 1084761088 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033904714142427283, + "loss": 2.8618, + "theoretical_loss": 3.6212366691244355, + "tokens_seen": 1084826624 + }, + { + "epoch": 3.02, + "learning_rate": 0.000339037111334002, + "loss": 2.9768, + "theoretical_loss": 3.6212161629501143, + "tokens_seen": 1084892160 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033902708124373124, + "loss": 2.9114, + "theoretical_loss": 3.6211956583613114, + "tokens_seen": 1084957696 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033901705115346037, + "loss": 3.0145, + "theoretical_loss": 3.621175155357808, + "tokens_seen": 1085023232 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003390070210631896, + "loss": 3.0757, + "theoretical_loss": 3.621154653939386, + "tokens_seen": 1085088768 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033899699097291873, + "loss": 2.8255, + "theoretical_loss": 3.6211341541058273, + "tokens_seen": 1085154304 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033898696088264797, + "loss": 2.9178, + "theoretical_loss": 3.6211136558569135, + "tokens_seen": 1085219840 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033897693079237715, + "loss": 2.9026, + "theoretical_loss": 3.6210931591924265, + "tokens_seen": 1085285376 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033896690070210633, + "loss": 2.9076, + "theoretical_loss": 3.6210726641121473, + "tokens_seen": 1085350912 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003389568706118355, + "loss": 2.8467, + "theoretical_loss": 3.621052170615859, + "tokens_seen": 1085416448 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003389468405215647, + "loss": 2.8214, + "theoretical_loss": 3.6210316787033436, + "tokens_seen": 1085481984 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003389368104312939, + "loss": 2.8746, + "theoretical_loss": 3.621011188374382, + "tokens_seen": 1085547520 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003389267803410231, + "loss": 2.8563, + "theoretical_loss": 3.6209906996287575, + "tokens_seen": 1085613056 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033891675025075224, + "loss": 2.7721, + "theoretical_loss": 3.620970212466251, + "tokens_seen": 1085678592 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033890672016048147, + "loss": 2.843, + "theoretical_loss": 3.6209497268866455, + "tokens_seen": 1085744128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003388966900702106, + "loss": 2.9758, + "theoretical_loss": 3.6209292428897224, + "tokens_seen": 1085809664 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033888665997993983, + "loss": 2.9408, + "theoretical_loss": 3.620908760475265, + "tokens_seen": 1085875200 + }, + { + "epoch": 3.02, + "learning_rate": 0.000338876629889669, + "loss": 2.9301, + "theoretical_loss": 3.6208882796430544, + "tokens_seen": 1085940736 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003388665997993982, + "loss": 2.7488, + "theoretical_loss": 3.6208678003928734, + "tokens_seen": 1086006272 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003388565697091274, + "loss": 2.9519, + "theoretical_loss": 3.6208473227245044, + "tokens_seen": 1086071808 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003388465396188566, + "loss": 2.8658, + "theoretical_loss": 3.62082684663773, + "tokens_seen": 1086137344 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033883650952858574, + "loss": 2.8226, + "theoretical_loss": 3.6208063721323316, + "tokens_seen": 1086202880 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1743440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.684760808944702, + "objective/train/theoretical_loss": 3.6208012537530467, + "objective/train/tokens_used": 1106679264, + "theoretical_loss": 3.6208012537530467, + "tokens_seen": 1086219264 + }, + { + "epoch": 3.02, + "learning_rate": 0.000338826479438315, + "loss": 2.8652, + "theoretical_loss": 3.6207858992080926, + "tokens_seen": 1086268416 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003388164493480441, + "loss": 2.968, + "theoretical_loss": 3.6207654278647956, + "tokens_seen": 1086333952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033880641925777334, + "loss": 2.8111, + "theoretical_loss": 3.620744958102223, + "tokens_seen": 1086399488 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003387963891675025, + "loss": 2.987, + "theoretical_loss": 3.620724489920157, + "tokens_seen": 1086465024 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003387863590772317, + "loss": 2.7954, + "theoretical_loss": 3.620704023318381, + "tokens_seen": 1086530560 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003387763289869609, + "loss": 2.8589, + "theoretical_loss": 3.6206835582966765, + "tokens_seen": 1086596096 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033876629889669006, + "loss": 2.8996, + "theoretical_loss": 3.6206630948548275, + "tokens_seen": 1086661632 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033875626880641924, + "loss": 3.0111, + "theoretical_loss": 3.620642632992616, + "tokens_seen": 1086727168 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003387462387161485, + "loss": 2.8418, + "theoretical_loss": 3.6206221727098247, + "tokens_seen": 1086792704 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003387362086258776, + "loss": 2.9631, + "theoretical_loss": 3.620601714006237, + "tokens_seen": 1086858240 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033872617853560684, + "loss": 2.8309, + "theoretical_loss": 3.6205812568816356, + "tokens_seen": 1086923776 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033871614844533597, + "loss": 2.8817, + "theoretical_loss": 3.6205608013358033, + "tokens_seen": 1086989312 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003387061183550652, + "loss": 2.8189, + "theoretical_loss": 3.620540347368523, + "tokens_seen": 1087054848 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003386960882647944, + "loss": 3.0171, + "theoretical_loss": 3.620519894979578, + "tokens_seen": 1087120384 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033868605817452356, + "loss": 2.8959, + "theoretical_loss": 3.6204994441687512, + "tokens_seen": 1087185920 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033867602808425275, + "loss": 2.9761, + "theoretical_loss": 3.620478994935826, + "tokens_seen": 1087251456 + }, + { + "epoch": 3.02, + "learning_rate": 0.000338665997993982, + "loss": 2.9755, + "theoretical_loss": 3.6204585472805855, + "tokens_seen": 1087316992 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003386559679037111, + "loss": 2.8903, + "theoretical_loss": 3.6204381012028124, + "tokens_seen": 1087382528 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033864593781344034, + "loss": 2.9914, + "theoretical_loss": 3.6204176567022905, + "tokens_seen": 1087448064 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033863590772316947, + "loss": 2.9867, + "theoretical_loss": 3.6203972137788027, + "tokens_seen": 1087513600 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003386258776328987, + "loss": 2.7006, + "theoretical_loss": 3.6203767724321327, + "tokens_seen": 1087579136 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003386158475426279, + "loss": 2.8995, + "theoretical_loss": 3.6203563326620634, + "tokens_seen": 1087644672 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033860581745235707, + "loss": 2.927, + "theoretical_loss": 3.6203358944683783, + "tokens_seen": 1087710208 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033859578736208625, + "loss": 2.9255, + "theoretical_loss": 3.620315457850862, + "tokens_seen": 1087775744 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033858575727181543, + "loss": 2.8371, + "theoretical_loss": 3.6202950228092963, + "tokens_seen": 1087841280 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1744938, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6639416217803955, + "objective/train/theoretical_loss": 3.620289914295122, + "objective/train/tokens_used": 1108317664, + "theoretical_loss": 3.620289914295122, + "tokens_seen": 1087857664 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033857572718154467, + "loss": 2.9041, + "theoretical_loss": 3.620274589343465, + "tokens_seen": 1087906816 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033856569709127385, + "loss": 3.0446, + "theoretical_loss": 3.620254157453153, + "tokens_seen": 1087972352 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033855566700100303, + "loss": 2.8219, + "theoretical_loss": 3.620233727138143, + "tokens_seen": 1088037888 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003385456369107322, + "loss": 2.859, + "theoretical_loss": 3.6202132983982183, + "tokens_seen": 1088103424 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033853560682046144, + "loss": 2.7953, + "theoretical_loss": 3.6201928712331632, + "tokens_seen": 1088168960 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033852557673019057, + "loss": 2.8467, + "theoretical_loss": 3.6201724456427615, + "tokens_seen": 1088234496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003385155466399198, + "loss": 2.861, + "theoretical_loss": 3.620152021626797, + "tokens_seen": 1088300032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033850551654964893, + "loss": 2.911, + "theoretical_loss": 3.620131599185053, + "tokens_seen": 1088365568 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033849548645937817, + "loss": 3.031, + "theoretical_loss": 3.6201111783173134, + "tokens_seen": 1088431104 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033848545636910735, + "loss": 2.7922, + "theoretical_loss": 3.6200907590233635, + "tokens_seen": 1088496640 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033847542627883653, + "loss": 2.9399, + "theoretical_loss": 3.620070341302985, + "tokens_seen": 1088562176 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003384653961885657, + "loss": 2.8264, + "theoretical_loss": 3.620049925155964, + "tokens_seen": 1088627712 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003384553660982949, + "loss": 2.849, + "theoretical_loss": 3.620029510582084, + "tokens_seen": 1088693248 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003384453360080241, + "loss": 2.913, + "theoretical_loss": 3.620009097581128, + "tokens_seen": 1088758784 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003384353059177533, + "loss": 2.9747, + "theoretical_loss": 3.6199886861528814, + "tokens_seen": 1088824320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033842527582748244, + "loss": 2.7678, + "theoretical_loss": 3.6199682762971275, + "tokens_seen": 1088889856 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033841524573721167, + "loss": 2.7859, + "theoretical_loss": 3.619947868013651, + "tokens_seen": 1088955392 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003384052156469408, + "loss": 2.8015, + "theoretical_loss": 3.6199274613022365, + "tokens_seen": 1089020928 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033839518555667003, + "loss": 2.9436, + "theoretical_loss": 3.619907056162668, + "tokens_seen": 1089086464 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003383851554663992, + "loss": 2.8015, + "theoretical_loss": 3.6198866525947295, + "tokens_seen": 1089152000 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003383751253761284, + "loss": 3.0076, + "theoretical_loss": 3.6198662505982053, + "tokens_seen": 1089217536 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003383650952858576, + "loss": 2.8246, + "theoretical_loss": 3.6198458501728807, + "tokens_seen": 1089283072 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003383550651955868, + "loss": 2.7268, + "theoretical_loss": 3.6198254513185395, + "tokens_seen": 1089348608 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033834503510531594, + "loss": 2.913, + "theoretical_loss": 3.619805054034966, + "tokens_seen": 1089414144 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003383350050150452, + "loss": 2.9054, + "theoretical_loss": 3.6197846583219464, + "tokens_seen": 1089479680 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1747703, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6111607551574707, + "objective/train/theoretical_loss": 3.6197795596390643, + "objective/train/tokens_used": 1109956064, + "theoretical_loss": 3.6197795596390643, + "tokens_seen": 1089496064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003383249749247743, + "loss": 2.8431, + "theoretical_loss": 3.619764264179263, + "tokens_seen": 1089545216 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033831494483450354, + "loss": 2.7627, + "theoretical_loss": 3.6197438716067016, + "tokens_seen": 1089610752 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003383049147442327, + "loss": 2.8282, + "theoretical_loss": 3.619723480604047, + "tokens_seen": 1089676288 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003382948846539619, + "loss": 2.9099, + "theoretical_loss": 3.6197030911710835, + "tokens_seen": 1089741824 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003382848545636911, + "loss": 2.9121, + "theoretical_loss": 3.6196827033075967, + "tokens_seen": 1089807360 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033827482447342026, + "loss": 2.8248, + "theoretical_loss": 3.6196623170133706, + "tokens_seen": 1089872896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033826479438314944, + "loss": 2.5903, + "theoretical_loss": 3.6196419322881903, + "tokens_seen": 1089938432 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003382547642928787, + "loss": 2.9129, + "theoretical_loss": 3.619621549131841, + "tokens_seen": 1090003968 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003382447342026078, + "loss": 2.9501, + "theoretical_loss": 3.619601167544107, + "tokens_seen": 1090069504 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033823470411233704, + "loss": 2.8947, + "theoretical_loss": 3.6195807875247734, + "tokens_seen": 1090135040 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033822467402206617, + "loss": 2.8842, + "theoretical_loss": 3.6195604090736255, + "tokens_seen": 1090200576 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003382146439317954, + "loss": 2.7339, + "theoretical_loss": 3.6195400321904487, + "tokens_seen": 1090266112 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003382046138415246, + "loss": 2.7612, + "theoretical_loss": 3.6195196568750276, + "tokens_seen": 1090331648 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033819458375125376, + "loss": 2.8831, + "theoretical_loss": 3.6194992831271477, + "tokens_seen": 1090397184 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033818455366098295, + "loss": 2.6964, + "theoretical_loss": 3.6194789109465937, + "tokens_seen": 1090462720 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003381745235707122, + "loss": 2.7965, + "theoretical_loss": 3.6194585403331514, + "tokens_seen": 1090528256 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003381644934804413, + "loss": 2.9547, + "theoretical_loss": 3.619438171286606, + "tokens_seen": 1090593792 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033815446339017054, + "loss": 2.7624, + "theoretical_loss": 3.6194178038067424, + "tokens_seen": 1090659328 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033814443329989967, + "loss": 2.7669, + "theoretical_loss": 3.6193974378933467, + "tokens_seen": 1090724864 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003381344032096289, + "loss": 2.816, + "theoretical_loss": 3.619377073546203, + "tokens_seen": 1090790400 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003381243731193581, + "loss": 3.0361, + "theoretical_loss": 3.6193567107650986, + "tokens_seen": 1090855936 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033811434302908727, + "loss": 2.796, + "theoretical_loss": 3.619336349549817, + "tokens_seen": 1090921472 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033810431293881645, + "loss": 2.9699, + "theoretical_loss": 3.619315989900146, + "tokens_seen": 1090987008 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033809428284854563, + "loss": 2.7505, + "theoretical_loss": 3.619295631815869, + "tokens_seen": 1091052544 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003380842527582748, + "loss": 2.8671, + "theoretical_loss": 3.619275275296773, + "tokens_seen": 1091118080 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1750650, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9163548946380615, + "objective/train/theoretical_loss": 3.6192701864115335, + "objective/train/tokens_used": 1111594464, + "theoretical_loss": 3.6192701864115335, + "tokens_seen": 1091134464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033807422266800405, + "loss": 2.9492, + "theoretical_loss": 3.619254920342643, + "tokens_seen": 1091183616 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003380641925777332, + "loss": 3.0224, + "theoretical_loss": 3.6192345669532653, + "tokens_seen": 1091249152 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003380541624874624, + "loss": 2.9268, + "theoretical_loss": 3.619214215128425, + "tokens_seen": 1091314688 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033804413239719154, + "loss": 2.9249, + "theoretical_loss": 3.619193864867908, + "tokens_seen": 1091380224 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033803410230692077, + "loss": 2.9558, + "theoretical_loss": 3.6191735161715006, + "tokens_seen": 1091445760 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033802407221664995, + "loss": 2.7397, + "theoretical_loss": 3.6191531690389884, + "tokens_seen": 1091511296 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033801404212637913, + "loss": 2.8384, + "theoretical_loss": 3.6191328234701574, + "tokens_seen": 1091576832 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003380040120361083, + "loss": 3.0595, + "theoretical_loss": 3.619112479464793, + "tokens_seen": 1091642368 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033799398194583755, + "loss": 2.8708, + "theoretical_loss": 3.6190921370226823, + "tokens_seen": 1091707904 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003379839518555667, + "loss": 2.7749, + "theoretical_loss": 3.6190717961436105, + "tokens_seen": 1091773440 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003379739217652959, + "loss": 2.8415, + "theoretical_loss": 3.6190514568273633, + "tokens_seen": 1091838976 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033796389167502504, + "loss": 2.8765, + "theoretical_loss": 3.6190311190737283, + "tokens_seen": 1091904512 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003379538615847543, + "loss": 2.9138, + "theoretical_loss": 3.6190107828824907, + "tokens_seen": 1091970048 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033794383149448346, + "loss": 2.9663, + "theoretical_loss": 3.6189904482534363, + "tokens_seen": 1092035584 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033793380140421264, + "loss": 2.982, + "theoretical_loss": 3.6189701151863525, + "tokens_seen": 1092101120 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003379237713139418, + "loss": 3.0055, + "theoretical_loss": 3.6189497836810247, + "tokens_seen": 1092166656 + }, + { + "epoch": 3.02, + "learning_rate": 0.000337913741223671, + "loss": 2.8175, + "theoretical_loss": 3.6189294537372394, + "tokens_seen": 1092232192 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003379037111334002, + "loss": 2.8238, + "theoretical_loss": 3.6189091253547834, + "tokens_seen": 1092297728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003378936810431294, + "loss": 2.937, + "theoretical_loss": 3.6188887985334426, + "tokens_seen": 1092363264 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033788365095285854, + "loss": 2.8708, + "theoretical_loss": 3.618868473273004, + "tokens_seen": 1092428800 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003378736208625878, + "loss": 2.8771, + "theoretical_loss": 3.618848149573253, + "tokens_seen": 1092494336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003378635907723169, + "loss": 2.8354, + "theoretical_loss": 3.618827827433978, + "tokens_seen": 1092559872 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033785356068204614, + "loss": 2.9523, + "theoretical_loss": 3.618807506854964, + "tokens_seen": 1092625408 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003378435305917753, + "loss": 2.9241, + "theoretical_loss": 3.6187871878359985, + "tokens_seen": 1092690944 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003378335005015045, + "loss": 2.9876, + "theoretical_loss": 3.6187668703768674, + "tokens_seen": 1092756480 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1753040, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0006515979766846, + "objective/train/theoretical_loss": 3.6187617912557837, + "objective/train/tokens_used": 1113232864, + "theoretical_loss": 3.6187617912557837, + "tokens_seen": 1092772864 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033782347041123374, + "loss": 2.9618, + "theoretical_loss": 3.618746554477358, + "tokens_seen": 1092822016 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003378134403209629, + "loss": 2.7891, + "theoretical_loss": 3.6187262401372573, + "tokens_seen": 1092887552 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003378034102306921, + "loss": 2.8106, + "theoretical_loss": 3.618705927356351, + "tokens_seen": 1092953088 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003377933801404213, + "loss": 3.0539, + "theoretical_loss": 3.6186856161344276, + "tokens_seen": 1093018624 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033778335005015046, + "loss": 2.9656, + "theoretical_loss": 3.618665306471273, + "tokens_seen": 1093084160 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033777331995987964, + "loss": 2.8662, + "theoretical_loss": 3.6186449983666735, + "tokens_seen": 1093149696 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003377632898696089, + "loss": 2.8845, + "theoretical_loss": 3.6186246918204175, + "tokens_seen": 1093215232 + }, + { + "epoch": 3.02, + "learning_rate": 0.000337753259779338, + "loss": 2.9832, + "theoretical_loss": 3.618604386832291, + "tokens_seen": 1093280768 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033774322968906724, + "loss": 2.9621, + "theoretical_loss": 3.618584083402081, + "tokens_seen": 1093346304 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033773319959879637, + "loss": 2.7887, + "theoretical_loss": 3.6185637815295753, + "tokens_seen": 1093411840 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003377231695085256, + "loss": 3.042, + "theoretical_loss": 3.6185434812145605, + "tokens_seen": 1093477376 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003377131394182548, + "loss": 3.0517, + "theoretical_loss": 3.6185231824568236, + "tokens_seen": 1093542912 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033770310932798396, + "loss": 2.9754, + "theoretical_loss": 3.6185028852561527, + "tokens_seen": 1093608448 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033769307923771315, + "loss": 2.9026, + "theoretical_loss": 3.6184825896123343, + "tokens_seen": 1093673984 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003376830491474424, + "loss": 3.0055, + "theoretical_loss": 3.6184622955251564, + "tokens_seen": 1093739520 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003376730190571715, + "loss": 2.9421, + "theoretical_loss": 3.6184420029944055, + "tokens_seen": 1093805056 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033766298896690074, + "loss": 2.6667, + "theoretical_loss": 3.6184217120198694, + "tokens_seen": 1093870592 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033765295887662987, + "loss": 3.0634, + "theoretical_loss": 3.6184014226013357, + "tokens_seen": 1093936128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003376429287863591, + "loss": 2.8815, + "theoretical_loss": 3.6183811347385912, + "tokens_seen": 1094001664 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003376328986960883, + "loss": 2.8227, + "theoretical_loss": 3.6183608484314242, + "tokens_seen": 1094067200 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033762286860581747, + "loss": 2.8626, + "theoretical_loss": 3.618340563679622, + "tokens_seen": 1094132736 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033761283851554665, + "loss": 2.956, + "theoretical_loss": 3.6183202804829717, + "tokens_seen": 1094198272 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033760280842527583, + "loss": 2.8486, + "theoretical_loss": 3.6182999988412616, + "tokens_seen": 1094263808 + }, + { + "epoch": 3.02, + "learning_rate": 0.000337592778335005, + "loss": 3.0843, + "theoretical_loss": 3.6182797187542795, + "tokens_seen": 1094329344 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033758274824473425, + "loss": 2.8624, + "theoretical_loss": 3.618259440221813, + "tokens_seen": 1094394880 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1755697, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.018609046936035, + "objective/train/theoretical_loss": 3.618254370831564, + "objective/train/tokens_used": 1114871264, + "theoretical_loss": 3.618254370831564, + "tokens_seen": 1094411264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003375727181544634, + "loss": 2.9599, + "theoretical_loss": 3.618239163243649, + "tokens_seen": 1094460416 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003375626880641926, + "loss": 2.9534, + "theoretical_loss": 3.618218887819576, + "tokens_seen": 1094525952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033755265797392174, + "loss": 2.8813, + "theoretical_loss": 3.618198613949382, + "tokens_seen": 1094591488 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033754262788365097, + "loss": 2.8611, + "theoretical_loss": 3.618178341632855, + "tokens_seen": 1094657024 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033753259779338015, + "loss": 2.9964, + "theoretical_loss": 3.6181580708697823, + "tokens_seen": 1094722560 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033752256770310933, + "loss": 2.9071, + "theoretical_loss": 3.6181378016599517, + "tokens_seen": 1094788096 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003375125376128385, + "loss": 3.0257, + "theoretical_loss": 3.6181175340031526, + "tokens_seen": 1094853632 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033750250752256775, + "loss": 2.8153, + "theoretical_loss": 3.6180972678991714, + "tokens_seen": 1094919168 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003374924774322969, + "loss": 2.938, + "theoretical_loss": 3.618077003347797, + "tokens_seen": 1094984704 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003374824473420261, + "loss": 2.9148, + "theoretical_loss": 3.618056740348818, + "tokens_seen": 1095050240 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033747241725175524, + "loss": 3.0363, + "theoretical_loss": 3.618036478902022, + "tokens_seen": 1095115776 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003374623871614845, + "loss": 2.7695, + "theoretical_loss": 3.618016219007197, + "tokens_seen": 1095181312 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033745235707121366, + "loss": 2.9087, + "theoretical_loss": 3.617995960664132, + "tokens_seen": 1095246848 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033744232698094284, + "loss": 2.899, + "theoretical_loss": 3.617975703872615, + "tokens_seen": 1095312384 + }, + { + "epoch": 3.02, + "learning_rate": 0.000337432296890672, + "loss": 2.7984, + "theoretical_loss": 3.6179554486324332, + "tokens_seen": 1095377920 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003374222668004012, + "loss": 2.9578, + "theoretical_loss": 3.6179351949433767, + "tokens_seen": 1095443456 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003374122367101304, + "loss": 3.0061, + "theoretical_loss": 3.617914942805233, + "tokens_seen": 1095508992 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003374022066198596, + "loss": 2.8345, + "theoretical_loss": 3.6178946922177913, + "tokens_seen": 1095574528 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033739217652958874, + "loss": 2.9149, + "theoretical_loss": 3.617874443180839, + "tokens_seen": 1095640064 + }, + { + "epoch": 3.02, + "learning_rate": 0.000337382146439318, + "loss": 2.9782, + "theoretical_loss": 3.6178541956941657, + "tokens_seen": 1095705600 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003373721163490471, + "loss": 2.95, + "theoretical_loss": 3.6178339497575593, + "tokens_seen": 1095771136 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033736208625877634, + "loss": 2.9014, + "theoretical_loss": 3.617813705370809, + "tokens_seen": 1095836672 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003373520561685055, + "loss": 2.9927, + "theoretical_loss": 3.6177934625337027, + "tokens_seen": 1095902208 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003373420260782347, + "loss": 2.8789, + "theoretical_loss": 3.61777322124603, + "tokens_seen": 1095967744 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003373319959879639, + "loss": 3.0087, + "theoretical_loss": 3.617752981507579, + "tokens_seen": 1096033280 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1758536, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9226081371307373, + "objective/train/theoretical_loss": 3.617747921815008, + "objective/train/tokens_used": 1116509664, + "theoretical_loss": 3.617747921815008, + "tokens_seen": 1096049664 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003373219658976931, + "loss": 2.8836, + "theoretical_loss": 3.617732743318139, + "tokens_seen": 1096098816 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033731193580742225, + "loss": 2.7436, + "theoretical_loss": 3.6177125066774987, + "tokens_seen": 1096164352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003373019057171515, + "loss": 3.0114, + "theoretical_loss": 3.6176922715854465, + "tokens_seen": 1096229888 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003372918756268806, + "loss": 2.8946, + "theoretical_loss": 3.617672038041772, + "tokens_seen": 1096295424 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033728184553660984, + "loss": 2.838, + "theoretical_loss": 3.617651806046264, + "tokens_seen": 1096360960 + }, + { + "epoch": 3.02, + "learning_rate": 0.000337271815446339, + "loss": 2.9017, + "theoretical_loss": 3.617631575598711, + "tokens_seen": 1096426496 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003372617853560682, + "loss": 2.8339, + "theoretical_loss": 3.6176113466989026, + "tokens_seen": 1096492032 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003372517552657974, + "loss": 2.9261, + "theoretical_loss": 3.6175911193466277, + "tokens_seen": 1096557568 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033724172517552657, + "loss": 2.9156, + "theoretical_loss": 3.617570893541676, + "tokens_seen": 1096623104 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033723169508525575, + "loss": 2.8635, + "theoretical_loss": 3.617550669283836, + "tokens_seen": 1096688640 + }, + { + "epoch": 3.02, + "learning_rate": 0.000337221664994985, + "loss": 2.8976, + "theoretical_loss": 3.6175304465728964, + "tokens_seen": 1096754176 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003372116349047141, + "loss": 2.8716, + "theoretical_loss": 3.6175102254086475, + "tokens_seen": 1096819712 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033720160481444335, + "loss": 3.0223, + "theoretical_loss": 3.6174900057908785, + "tokens_seen": 1096885248 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003371915747241725, + "loss": 2.8693, + "theoretical_loss": 3.617469787719378, + "tokens_seen": 1096950784 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003371815446339017, + "loss": 2.7384, + "theoretical_loss": 3.6174495711939363, + "tokens_seen": 1097016320 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003371715145436309, + "loss": 2.9046, + "theoretical_loss": 3.617429356214342, + "tokens_seen": 1097081856 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033716148445336007, + "loss": 2.8922, + "theoretical_loss": 3.617409142780385, + "tokens_seen": 1097147392 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033715145436308925, + "loss": 2.9771, + "theoretical_loss": 3.617388930891855, + "tokens_seen": 1097212928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003371414242728185, + "loss": 2.8745, + "theoretical_loss": 3.6173687205485408, + "tokens_seen": 1097278464 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003371313941825476, + "loss": 2.7776, + "theoretical_loss": 3.617348511750233, + "tokens_seen": 1097344000 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033712136409227685, + "loss": 2.9362, + "theoretical_loss": 3.617328304496721, + "tokens_seen": 1097409536 + }, + { + "epoch": 3.02, + "learning_rate": 0.000337111334002006, + "loss": 2.9395, + "theoretical_loss": 3.6173080987877935, + "tokens_seen": 1097475072 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003371013039117352, + "loss": 2.9307, + "theoretical_loss": 3.617287894623241, + "tokens_seen": 1097540608 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003370912738214644, + "loss": 2.9678, + "theoretical_loss": 3.6172676920028533, + "tokens_seen": 1097606144 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003370812437311936, + "loss": 2.9521, + "theoretical_loss": 3.61724749092642, + "tokens_seen": 1097671680 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1761251, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9154646396636963, + "objective/train/theoretical_loss": 3.6172424408985293, + "objective/train/tokens_used": 1118148064, + "theoretical_loss": 3.6172424408985293, + "tokens_seen": 1097688064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003370712136409228, + "loss": 2.803, + "theoretical_loss": 3.6172272913937307, + "tokens_seen": 1097737216 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033706118355065194, + "loss": 3.0002, + "theoretical_loss": 3.6172070934045757, + "tokens_seen": 1097802752 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033705115346038117, + "loss": 2.8355, + "theoretical_loss": 3.617186896958745, + "tokens_seen": 1097868288 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033704112337011035, + "loss": 2.9221, + "theoretical_loss": 3.617166702056028, + "tokens_seen": 1097933824 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033703109327983953, + "loss": 2.6146, + "theoretical_loss": 3.617146508696215, + "tokens_seen": 1097999360 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003370210631895687, + "loss": 2.8507, + "theoretical_loss": 3.617126316879096, + "tokens_seen": 1098064896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033701103309929795, + "loss": 2.7311, + "theoretical_loss": 3.6171061266044617, + "tokens_seen": 1098130432 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003370010030090271, + "loss": 2.8718, + "theoretical_loss": 3.617085937872101, + "tokens_seen": 1098195968 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003369909729187563, + "loss": 2.7381, + "theoretical_loss": 3.617065750681806, + "tokens_seen": 1098261504 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033698094282848544, + "loss": 2.8806, + "theoretical_loss": 3.617045565033364, + "tokens_seen": 1098327040 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003369709127382147, + "loss": 2.9766, + "theoretical_loss": 3.6170253809265676, + "tokens_seen": 1098392576 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033696088264794386, + "loss": 2.8512, + "theoretical_loss": 3.6170051983612064, + "tokens_seen": 1098458112 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033695085255767304, + "loss": 2.9708, + "theoretical_loss": 3.616985017337071, + "tokens_seen": 1098523648 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003369408224674022, + "loss": 2.9663, + "theoretical_loss": 3.616964837853951, + "tokens_seen": 1098589184 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003369307923771314, + "loss": 2.8433, + "theoretical_loss": 3.6169446599116375, + "tokens_seen": 1098654720 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003369207622868606, + "loss": 2.8521, + "theoretical_loss": 3.61692448350992, + "tokens_seen": 1098720256 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003369107321965898, + "loss": 2.8332, + "theoretical_loss": 3.6169043086485906, + "tokens_seen": 1098785792 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033690070210631894, + "loss": 2.9658, + "theoretical_loss": 3.616884135327439, + "tokens_seen": 1098851328 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003368906720160482, + "loss": 2.8655, + "theoretical_loss": 3.616863963546255, + "tokens_seen": 1098916864 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003368806419257773, + "loss": 2.8108, + "theoretical_loss": 3.6168437933048305, + "tokens_seen": 1098982400 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033687061183550654, + "loss": 2.9216, + "theoretical_loss": 3.6168236246029557, + "tokens_seen": 1099047936 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003368605817452357, + "loss": 2.8602, + "theoretical_loss": 3.6168034574404206, + "tokens_seen": 1099113472 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003368505516549649, + "loss": 2.8543, + "theoretical_loss": 3.616783291817017, + "tokens_seen": 1099179008 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003368405215646941, + "loss": 2.8727, + "theoretical_loss": 3.6167631277325354, + "tokens_seen": 1099244544 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003368304914744233, + "loss": 2.9609, + "theoretical_loss": 3.616742965186766, + "tokens_seen": 1099310080 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1764151, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8508760929107666, + "objective/train/theoretical_loss": 3.6167379247907228, + "objective/train/tokens_used": 1119786464, + "theoretical_loss": 3.6167379247907228, + "tokens_seen": 1099326464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033682046138415245, + "loss": 2.7318, + "theoretical_loss": 3.6167228041795, + "tokens_seen": 1099375616 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003368104312938817, + "loss": 2.9996, + "theoretical_loss": 3.6167026447105286, + "tokens_seen": 1099441152 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003368004012036108, + "loss": 2.7931, + "theoretical_loss": 3.6166824867796428, + "tokens_seen": 1099506688 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033679037111334004, + "loss": 2.888, + "theoretical_loss": 3.6166623303866334, + "tokens_seen": 1099572224 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003367803410230692, + "loss": 2.8425, + "theoretical_loss": 3.6166421755312905, + "tokens_seen": 1099637760 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003367703109327984, + "loss": 2.9238, + "theoretical_loss": 3.6166220222134067, + "tokens_seen": 1099703296 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003367602808425276, + "loss": 2.8557, + "theoretical_loss": 3.6166018704327723, + "tokens_seen": 1099768832 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033675025075225677, + "loss": 2.9332, + "theoretical_loss": 3.6165817201891786, + "tokens_seen": 1099834368 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033674022066198595, + "loss": 2.7671, + "theoretical_loss": 3.616561571482417, + "tokens_seen": 1099899904 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003367301905717152, + "loss": 2.8432, + "theoretical_loss": 3.6165414243122775, + "tokens_seen": 1099965440 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003367201604814443, + "loss": 2.6676, + "theoretical_loss": 3.6165212786785537, + "tokens_seen": 1100030976 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033671013039117355, + "loss": 2.8293, + "theoretical_loss": 3.6165011345810347, + "tokens_seen": 1100096512 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033670010030090273, + "loss": 2.9422, + "theoretical_loss": 3.616480992019513, + "tokens_seen": 1100162048 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003366900702106319, + "loss": 2.8723, + "theoretical_loss": 3.6164608509937795, + "tokens_seen": 1100227584 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003366800401203611, + "loss": 2.8711, + "theoretical_loss": 3.6164407115036257, + "tokens_seen": 1100293120 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033667001003009027, + "loss": 2.798, + "theoretical_loss": 3.6164205735488433, + "tokens_seen": 1100358656 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033665997993981945, + "loss": 2.9437, + "theoretical_loss": 3.616400437129224, + "tokens_seen": 1100424192 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003366499498495487, + "loss": 2.8409, + "theoretical_loss": 3.6163803022445586, + "tokens_seen": 1100489728 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003366399197592778, + "loss": 2.981, + "theoretical_loss": 3.61636016889464, + "tokens_seen": 1100555264 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033662988966900705, + "loss": 3.0115, + "theoretical_loss": 3.616340037079258, + "tokens_seen": 1100620800 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003366198595787362, + "loss": 2.9173, + "theoretical_loss": 3.616319906798206, + "tokens_seen": 1100686336 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003366098294884654, + "loss": 2.9215, + "theoretical_loss": 3.6162997780512747, + "tokens_seen": 1100751872 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003365997993981946, + "loss": 2.9476, + "theoretical_loss": 3.616279650838256, + "tokens_seen": 1100817408 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003365897693079238, + "loss": 2.8523, + "theoretical_loss": 3.6162595251589424, + "tokens_seen": 1100882944 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033657973921765295, + "loss": 3.0015, + "theoretical_loss": 3.6162394010131247, + "tokens_seen": 1100948480 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1766715, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1441173553466797, + "objective/train/theoretical_loss": 3.6162343702162545, + "objective/train/tokens_used": 1121424864, + "theoretical_loss": 3.6162343702162545, + "tokens_seen": 1100964864 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033656970912738214, + "loss": 2.8767, + "theoretical_loss": 3.6162192784005955, + "tokens_seen": 1101014016 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003365596790371113, + "loss": 2.8846, + "theoretical_loss": 3.6161991573211463, + "tokens_seen": 1101079552 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033654964894684055, + "loss": 2.8733, + "theoretical_loss": 3.6161790377745695, + "tokens_seen": 1101145088 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003365396188565697, + "loss": 2.9073, + "theoretical_loss": 3.6161589197606565, + "tokens_seen": 1101210624 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003365295887662989, + "loss": 2.8756, + "theoretical_loss": 3.6161388032792, + "tokens_seen": 1101276160 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003365195586760281, + "loss": 2.9059, + "theoretical_loss": 3.6161186883299923, + "tokens_seen": 1101341696 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003365095285857573, + "loss": 2.8482, + "theoretical_loss": 3.616098574912824, + "tokens_seen": 1101407232 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033649949849548646, + "loss": 2.9452, + "theoretical_loss": 3.6160784630274887, + "tokens_seen": 1101472768 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033648946840521564, + "loss": 2.9214, + "theoretical_loss": 3.616058352673778, + "tokens_seen": 1101538304 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003364794383149448, + "loss": 2.8809, + "theoretical_loss": 3.616038243851485, + "tokens_seen": 1101603840 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033646940822467406, + "loss": 2.895, + "theoretical_loss": 3.6160181365604007, + "tokens_seen": 1101669376 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003364593781344032, + "loss": 2.7314, + "theoretical_loss": 3.615998030800318, + "tokens_seen": 1101734912 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003364493480441324, + "loss": 2.8617, + "theoretical_loss": 3.6159779265710297, + "tokens_seen": 1101800448 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033643931795386154, + "loss": 2.9632, + "theoretical_loss": 3.6159578238723276, + "tokens_seen": 1101865984 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003364292878635908, + "loss": 2.7552, + "theoretical_loss": 3.615937722704004, + "tokens_seen": 1101931520 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033641925777331996, + "loss": 2.8352, + "theoretical_loss": 3.6159176230658523, + "tokens_seen": 1101997056 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033640922768304914, + "loss": 2.9266, + "theoretical_loss": 3.6158975249576644, + "tokens_seen": 1102062592 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003363991975927783, + "loss": 2.9109, + "theoretical_loss": 3.6158774283792328, + "tokens_seen": 1102128128 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003363891675025075, + "loss": 2.8219, + "theoretical_loss": 3.61585733333035, + "tokens_seen": 1102193664 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003363791374122367, + "loss": 3.0249, + "theoretical_loss": 3.6158372398108094, + "tokens_seen": 1102259200 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003363691073219659, + "loss": 2.807, + "theoretical_loss": 3.615817147820403, + "tokens_seen": 1102324736 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033635907723169505, + "loss": 2.8749, + "theoretical_loss": 3.6157970573589235, + "tokens_seen": 1102390272 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003363490471414243, + "loss": 2.9151, + "theoretical_loss": 3.6157769684261636, + "tokens_seen": 1102455808 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033633901705115346, + "loss": 2.9125, + "theoretical_loss": 3.615756881021917, + "tokens_seen": 1102521344 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033632898696088265, + "loss": 2.8956, + "theoretical_loss": 3.615736795145976, + "tokens_seen": 1102586880 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1768042, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.21588397026062, + "objective/train/theoretical_loss": 3.615731773915764, + "objective/train/tokens_used": 1123063264, + "theoretical_loss": 3.615731773915764, + "tokens_seen": 1102603264 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003363189568706119, + "loss": 3.0184, + "theoretical_loss": 3.615716710798133, + "tokens_seen": 1102652416 + }, + { + "epoch": 3.02, + "learning_rate": 0.000336308926780341, + "loss": 2.8528, + "theoretical_loss": 3.615696627978182, + "tokens_seen": 1102717952 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033629889669007024, + "loss": 2.9016, + "theoretical_loss": 3.615676546685915, + "tokens_seen": 1102783488 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003362888665997994, + "loss": 2.9292, + "theoretical_loss": 3.6156564669211253, + "tokens_seen": 1102849024 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003362788365095286, + "loss": 2.7356, + "theoretical_loss": 3.615636388683606, + "tokens_seen": 1102914560 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003362688064192578, + "loss": 2.9538, + "theoretical_loss": 3.6156163119731506, + "tokens_seen": 1102980096 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033625877632898697, + "loss": 2.9115, + "theoretical_loss": 3.6155962367895516, + "tokens_seen": 1103045632 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033624874623871615, + "loss": 2.8965, + "theoretical_loss": 3.6155761631326024, + "tokens_seen": 1103111168 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003362387161484454, + "loss": 2.8432, + "theoretical_loss": 3.6155560910020963, + "tokens_seen": 1103176704 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003362286860581745, + "loss": 3.0917, + "theoretical_loss": 3.615536020397826, + "tokens_seen": 1103242240 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033621865596790375, + "loss": 2.8686, + "theoretical_loss": 3.615515951319586, + "tokens_seen": 1103307776 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033620862587763293, + "loss": 3.1371, + "theoretical_loss": 3.6154958837671685, + "tokens_seen": 1103373312 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003361985957873621, + "loss": 2.8327, + "theoretical_loss": 3.615475817740368, + "tokens_seen": 1103438848 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003361885656970913, + "loss": 2.7122, + "theoretical_loss": 3.6154557532389764, + "tokens_seen": 1103504384 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033617853560682047, + "loss": 2.9085, + "theoretical_loss": 3.615435690262788, + "tokens_seen": 1103569920 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033616850551654965, + "loss": 2.8883, + "theoretical_loss": 3.615415628811597, + "tokens_seen": 1103635456 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003361584754262789, + "loss": 2.7757, + "theoretical_loss": 3.6153955688851953, + "tokens_seen": 1103700992 + }, + { + "epoch": 3.02, + "learning_rate": 0.000336148445336008, + "loss": 2.9651, + "theoretical_loss": 3.615375510483378, + "tokens_seen": 1103766528 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033613841524573725, + "loss": 2.8084, + "theoretical_loss": 3.615355453605938, + "tokens_seen": 1103832064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003361283851554664, + "loss": 2.7591, + "theoretical_loss": 3.615335398252669, + "tokens_seen": 1103897600 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003361183550651956, + "loss": 2.9035, + "theoretical_loss": 3.615315344423365, + "tokens_seen": 1103963136 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003361083249749248, + "loss": 2.8281, + "theoretical_loss": 3.6152952921178194, + "tokens_seen": 1104028672 + }, + { + "epoch": 3.02, + "learning_rate": 0.000336098294884654, + "loss": 2.9819, + "theoretical_loss": 3.615275241335826, + "tokens_seen": 1104094208 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033608826479438316, + "loss": 2.9584, + "theoretical_loss": 3.6152551920771785, + "tokens_seen": 1104159744 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033607823470411234, + "loss": 2.8835, + "theoretical_loss": 3.6152351443416713, + "tokens_seen": 1104225280 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1770814, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.181002616882324, + "objective/train/theoretical_loss": 3.615230132645761, + "objective/train/tokens_used": 1124701664, + "theoretical_loss": 3.615230132645761, + "tokens_seen": 1104241664 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003360682046138415, + "loss": 2.9918, + "theoretical_loss": 3.615215098129098, + "tokens_seen": 1104290816 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033605817452357075, + "loss": 2.8727, + "theoretical_loss": 3.615195053439252, + "tokens_seen": 1104356352 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003360481444332999, + "loss": 2.8241, + "theoretical_loss": 3.6151750102719284, + "tokens_seen": 1104421888 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003360381143430291, + "loss": 2.8157, + "theoretical_loss": 3.6151549686269204, + "tokens_seen": 1104487424 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003360280842527583, + "loss": 2.7086, + "theoretical_loss": 3.615134928504022, + "tokens_seen": 1104552960 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003360180541624875, + "loss": 2.8651, + "theoretical_loss": 3.615114889903028, + "tokens_seen": 1104618496 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033600802407221666, + "loss": 2.7602, + "theoretical_loss": 3.6150948528237317, + "tokens_seen": 1104684032 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033599799398194584, + "loss": 2.8355, + "theoretical_loss": 3.6150748172659277, + "tokens_seen": 1104749568 + }, + { + "epoch": 3.02, + "learning_rate": 0.000335987963891675, + "loss": 2.8622, + "theoretical_loss": 3.6150547832294104, + "tokens_seen": 1104815104 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033597793380140426, + "loss": 2.9389, + "theoretical_loss": 3.615034750713974, + "tokens_seen": 1104880640 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003359679037111334, + "loss": 2.8238, + "theoretical_loss": 3.6150147197194125, + "tokens_seen": 1104946176 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003359578736208626, + "loss": 2.7872, + "theoretical_loss": 3.61499469024552, + "tokens_seen": 1105011712 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033594784353059174, + "loss": 2.8715, + "theoretical_loss": 3.6149746622920915, + "tokens_seen": 1105077248 + }, + { + "epoch": 3.02, + "learning_rate": 0.000335937813440321, + "loss": 2.9615, + "theoretical_loss": 3.614954635858921, + "tokens_seen": 1105142784 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033592778335005016, + "loss": 2.9306, + "theoretical_loss": 3.614934610945804, + "tokens_seen": 1105208320 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033591775325977934, + "loss": 2.9166, + "theoretical_loss": 3.6149145875525335, + "tokens_seen": 1105273856 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003359077231695085, + "loss": 2.7833, + "theoretical_loss": 3.6148945656789047, + "tokens_seen": 1105339392 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003358976930792377, + "loss": 2.8846, + "theoretical_loss": 3.614874545324712, + "tokens_seen": 1105404928 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003358876629889669, + "loss": 2.7462, + "theoretical_loss": 3.6148545264897507, + "tokens_seen": 1105470464 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003358776328986961, + "loss": 2.822, + "theoretical_loss": 3.6148345091738143, + "tokens_seen": 1105536000 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033586760280842525, + "loss": 2.9481, + "theoretical_loss": 3.6148144933766986, + "tokens_seen": 1105601536 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003358575727181545, + "loss": 2.9197, + "theoretical_loss": 3.6147944790981983, + "tokens_seen": 1105667072 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033584754262788366, + "loss": 2.9546, + "theoretical_loss": 3.6147744663381074, + "tokens_seen": 1105732608 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033583751253761285, + "loss": 2.7521, + "theoretical_loss": 3.6147544550962207, + "tokens_seen": 1105798144 + }, + { + "epoch": 3.02, + "learning_rate": 0.000335827482447342, + "loss": 2.989, + "theoretical_loss": 3.614734445372334, + "tokens_seen": 1105863680 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1773480, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.030334949493408, + "objective/train/theoretical_loss": 3.6147294431785255, + "objective/train/tokens_used": 1126340064, + "theoretical_loss": 3.6147294431785255, + "tokens_seen": 1105880064 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003358174523570712, + "loss": 2.9425, + "theoretical_loss": 3.614714437166241, + "tokens_seen": 1105929216 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003358074222668004, + "loss": 3.0148, + "theoretical_loss": 3.614694430477738, + "tokens_seen": 1105994752 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003357973921765296, + "loss": 2.9571, + "theoretical_loss": 3.6146744253066183, + "tokens_seen": 1106060288 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033578736208625875, + "loss": 2.8572, + "theoretical_loss": 3.614654421652679, + "tokens_seen": 1106125824 + }, + { + "epoch": 3.02, + "learning_rate": 0.000335777331995988, + "loss": 2.8927, + "theoretical_loss": 3.6146344195157134, + "tokens_seen": 1106191360 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003357673019057171, + "loss": 2.8819, + "theoretical_loss": 3.6146144188955174, + "tokens_seen": 1106256896 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033575727181544635, + "loss": 2.7224, + "theoretical_loss": 3.614594419791886, + "tokens_seen": 1106322432 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033574724172517553, + "loss": 2.7244, + "theoretical_loss": 3.614574422204614, + "tokens_seen": 1106387968 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003357372116349047, + "loss": 2.9126, + "theoretical_loss": 3.6145544261334974, + "tokens_seen": 1106453504 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003357271815446339, + "loss": 2.8709, + "theoretical_loss": 3.614534431578331, + "tokens_seen": 1106519040 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033571715145436313, + "loss": 2.8621, + "theoretical_loss": 3.61451443853891, + "tokens_seen": 1106584576 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033570712136409225, + "loss": 2.671, + "theoretical_loss": 3.61449444701503, + "tokens_seen": 1106650112 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003356970912738215, + "loss": 2.9427, + "theoretical_loss": 3.6144744570064855, + "tokens_seen": 1106715648 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003356870611835506, + "loss": 2.8848, + "theoretical_loss": 3.614454468513074, + "tokens_seen": 1106781184 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033567703109327985, + "loss": 2.8938, + "theoretical_loss": 3.6144344815345884, + "tokens_seen": 1106846720 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033566700100300903, + "loss": 2.7585, + "theoretical_loss": 3.614414496070826, + "tokens_seen": 1106912256 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003356569709127382, + "loss": 2.7001, + "theoretical_loss": 3.6143945121215815, + "tokens_seen": 1106977792 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003356469408224674, + "loss": 2.8992, + "theoretical_loss": 3.6143745296866507, + "tokens_seen": 1107043328 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003356369107321966, + "loss": 2.768, + "theoretical_loss": 3.6143545487658297, + "tokens_seen": 1107108864 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033562688064192576, + "loss": 2.8617, + "theoretical_loss": 3.6143345693589133, + "tokens_seen": 1107174400 + }, + { + "epoch": 3.02, + "learning_rate": 0.000335616850551655, + "loss": 2.8906, + "theoretical_loss": 3.614314591465697, + "tokens_seen": 1107239936 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003356068204613841, + "loss": 3.0218, + "theoretical_loss": 3.614294615085978, + "tokens_seen": 1107305472 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033559679037111336, + "loss": 2.8186, + "theoretical_loss": 3.614274640219551, + "tokens_seen": 1107371008 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003355867602808425, + "loss": 2.7441, + "theoretical_loss": 3.614254666866212, + "tokens_seen": 1107436544 + }, + { + "epoch": 3.02, + "learning_rate": 0.0003355767301905717, + "loss": 2.9477, + "theoretical_loss": 3.6142346950257567, + "tokens_seen": 1107502080 + }, + { + "epoch": 3.02, + "objective/train/docs_used": 1776229, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0421228408813477, + "objective/train/theoretical_loss": 3.6142297023020067, + "objective/train/tokens_used": 1127978464, + "theoretical_loss": 3.6142297023020067, + "tokens_seen": 1107518464 + }, + { + "epoch": 3.02, + "learning_rate": 0.00033556670010030095, + "loss": 2.797, + "theoretical_loss": 3.614214724697981, + "tokens_seen": 1107567616 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003355566700100301, + "loss": 2.8702, + "theoretical_loss": 3.6141947558826812, + "tokens_seen": 1107633152 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003355466399197593, + "loss": 2.9756, + "theoretical_loss": 3.614174788579653, + "tokens_seen": 1107698688 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003355366098294885, + "loss": 2.7943, + "theoretical_loss": 3.6141548227886924, + "tokens_seen": 1107764224 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003355265797392177, + "loss": 2.7337, + "theoretical_loss": 3.614134858509596, + "tokens_seen": 1107829760 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033551654964894686, + "loss": 2.8571, + "theoretical_loss": 3.6141148957421585, + "tokens_seen": 1107895296 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033550651955867604, + "loss": 2.7704, + "theoretical_loss": 3.6140949344861775, + "tokens_seen": 1107960832 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003354964894684052, + "loss": 2.7421, + "theoretical_loss": 3.6140749747414485, + "tokens_seen": 1108026368 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033548645937813446, + "loss": 2.8459, + "theoretical_loss": 3.614055016507768, + "tokens_seen": 1108091904 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003354764292878636, + "loss": 2.9442, + "theoretical_loss": 3.6140350597849316, + "tokens_seen": 1108157440 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003354663991975928, + "loss": 3.0058, + "theoretical_loss": 3.6140151045727364, + "tokens_seen": 1108222976 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033545636910732195, + "loss": 2.8984, + "theoretical_loss": 3.613995150870978, + "tokens_seen": 1108288512 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003354463390170512, + "loss": 2.8062, + "theoretical_loss": 3.6139751986794533, + "tokens_seen": 1108354048 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033543630892678036, + "loss": 2.818, + "theoretical_loss": 3.613955247997959, + "tokens_seen": 1108419584 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033542627883650954, + "loss": 2.8931, + "theoretical_loss": 3.6139352988262905, + "tokens_seen": 1108485120 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003354162487462387, + "loss": 2.6922, + "theoretical_loss": 3.6139153511642452, + "tokens_seen": 1108550656 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003354062186559679, + "loss": 2.9672, + "theoretical_loss": 3.613895405011619, + "tokens_seen": 1108616192 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003353961885656971, + "loss": 2.7878, + "theoretical_loss": 3.6138754603682095, + "tokens_seen": 1108681728 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003353861584754263, + "loss": 2.8575, + "theoretical_loss": 3.6138555172338114, + "tokens_seen": 1108747264 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033537612838515545, + "loss": 3.0125, + "theoretical_loss": 3.613835575608223, + "tokens_seen": 1108812800 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003353660982948847, + "loss": 2.9101, + "theoretical_loss": 3.613815635491241, + "tokens_seen": 1108878336 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033535606820461386, + "loss": 2.8178, + "theoretical_loss": 3.6137956968826614, + "tokens_seen": 1108943872 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033534603811434305, + "loss": 2.9421, + "theoretical_loss": 3.613775759782281, + "tokens_seen": 1109009408 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003353360080240722, + "loss": 2.9746, + "theoretical_loss": 3.6137558241898966, + "tokens_seen": 1109074944 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003353259779338014, + "loss": 2.763, + "theoretical_loss": 3.613735890105305, + "tokens_seen": 1109140480 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1779121, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1098923683166504, + "objective/train/theoretical_loss": 3.6137309068197263, + "objective/train/tokens_used": 1129616864, + "theoretical_loss": 3.6137309068197263, + "tokens_seen": 1109156864 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003353159478435306, + "loss": 2.931, + "theoretical_loss": 3.613715957528304, + "tokens_seen": 1109206016 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003353059177532598, + "loss": 2.8842, + "theoretical_loss": 3.61369602645869, + "tokens_seen": 1109271552 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033529588766298895, + "loss": 2.93, + "theoretical_loss": 3.613676096896259, + "tokens_seen": 1109337088 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003352858575727182, + "loss": 2.9687, + "theoretical_loss": 3.6136561688408095, + "tokens_seen": 1109402624 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003352758274824473, + "loss": 2.9137, + "theoretical_loss": 3.613636242292137, + "tokens_seen": 1109468160 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033526579739217655, + "loss": 2.7849, + "theoretical_loss": 3.61361631725004, + "tokens_seen": 1109533696 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033525576730190573, + "loss": 2.7203, + "theoretical_loss": 3.6135963937143147, + "tokens_seen": 1109599232 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003352457372116349, + "loss": 3.0587, + "theoretical_loss": 3.6135764716847585, + "tokens_seen": 1109664768 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003352357071213641, + "loss": 2.9898, + "theoretical_loss": 3.613556551161169, + "tokens_seen": 1109730304 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033522567703109333, + "loss": 2.8694, + "theoretical_loss": 3.613536632143343, + "tokens_seen": 1109795840 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033521564694082245, + "loss": 2.8969, + "theoretical_loss": 3.6135167146310776, + "tokens_seen": 1109861376 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003352056168505517, + "loss": 2.8138, + "theoretical_loss": 3.61349679862417, + "tokens_seen": 1109926912 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003351955867602808, + "loss": 2.7271, + "theoretical_loss": 3.6134768841224187, + "tokens_seen": 1109992448 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033518555667001005, + "loss": 2.8009, + "theoretical_loss": 3.61345697112562, + "tokens_seen": 1110057984 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033517552657973923, + "loss": 2.7288, + "theoretical_loss": 3.613437059633572, + "tokens_seen": 1110123520 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003351654964894684, + "loss": 2.924, + "theoretical_loss": 3.613417149646071, + "tokens_seen": 1110189056 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003351554663991976, + "loss": 2.87, + "theoretical_loss": 3.6133972411629163, + "tokens_seen": 1110254592 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003351454363089268, + "loss": 2.9181, + "theoretical_loss": 3.6133773341839035, + "tokens_seen": 1110320128 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033513540621865596, + "loss": 2.8886, + "theoretical_loss": 3.6133574287088317, + "tokens_seen": 1110385664 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003351253761283852, + "loss": 2.807, + "theoretical_loss": 3.613337524737498, + "tokens_seen": 1110451200 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003351153460381143, + "loss": 2.796, + "theoretical_loss": 3.6133176222697, + "tokens_seen": 1110516736 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033510531594784356, + "loss": 2.9411, + "theoretical_loss": 3.6132977213052353, + "tokens_seen": 1110582272 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003350952858575727, + "loss": 3.0856, + "theoretical_loss": 3.613277821843902, + "tokens_seen": 1110647808 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003350852557673019, + "loss": 2.899, + "theoretical_loss": 3.6132579238854974, + "tokens_seen": 1110713344 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003350752256770311, + "loss": 2.9455, + "theoretical_loss": 3.6132380274298197, + "tokens_seen": 1110778880 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1780704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9620349407196045, + "objective/train/theoretical_loss": 3.613233053550678, + "objective/train/tokens_used": 1131255264, + "theoretical_loss": 3.613233053550678, + "tokens_seen": 1110795264 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003350651955867603, + "loss": 2.9179, + "theoretical_loss": 3.613218132476667, + "tokens_seen": 1110844416 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033505516549648946, + "loss": 2.9763, + "theoretical_loss": 3.613198239025836, + "tokens_seen": 1110909952 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003350451354062187, + "loss": 2.8283, + "theoretical_loss": 3.6131783470771266, + "tokens_seen": 1110975488 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003350351053159478, + "loss": 2.8196, + "theoretical_loss": 3.6131584566303347, + "tokens_seen": 1111041024 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033502507522567706, + "loss": 2.8957, + "theoretical_loss": 3.61313856768526, + "tokens_seen": 1111106560 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003350150451354062, + "loss": 2.8883, + "theoretical_loss": 3.6131186802416995, + "tokens_seen": 1111172096 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003350050150451354, + "loss": 3.0186, + "theoretical_loss": 3.613098794299452, + "tokens_seen": 1111237632 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003349949849548646, + "loss": 2.7655, + "theoretical_loss": 3.6130789098583147, + "tokens_seen": 1111303168 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003349849548645938, + "loss": 2.7273, + "theoretical_loss": 3.6130590269180862, + "tokens_seen": 1111368704 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033497492477432296, + "loss": 2.75, + "theoretical_loss": 3.6130391454785658, + "tokens_seen": 1111434240 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033496489468405215, + "loss": 2.8409, + "theoretical_loss": 3.61301926553955, + "tokens_seen": 1111499776 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003349548645937813, + "loss": 2.7011, + "theoretical_loss": 3.612999387100838, + "tokens_seen": 1111565312 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033494483450351056, + "loss": 2.9037, + "theoretical_loss": 3.612979510162228, + "tokens_seen": 1111630848 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003349348044132397, + "loss": 2.7082, + "theoretical_loss": 3.612959634723519, + "tokens_seen": 1111696384 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003349247743229689, + "loss": 2.8434, + "theoretical_loss": 3.612939760784508, + "tokens_seen": 1111761920 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033491474423269805, + "loss": 2.8274, + "theoretical_loss": 3.6129198883449947, + "tokens_seen": 1111827456 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003349047141424273, + "loss": 2.8537, + "theoretical_loss": 3.6129000174047774, + "tokens_seen": 1111892992 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033489468405215647, + "loss": 2.9797, + "theoretical_loss": 3.6128801479636543, + "tokens_seen": 1111958528 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033488465396188565, + "loss": 2.871, + "theoretical_loss": 3.6128602800214233, + "tokens_seen": 1112024064 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033487462387161483, + "loss": 2.9444, + "theoretical_loss": 3.612840413577884, + "tokens_seen": 1112089600 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033486459378134406, + "loss": 2.6887, + "theoretical_loss": 3.6128205486328353, + "tokens_seen": 1112155136 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003348545636910732, + "loss": 2.8469, + "theoretical_loss": 3.612800685186075, + "tokens_seen": 1112220672 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003348445336008024, + "loss": 3.0351, + "theoretical_loss": 3.6127808232374026, + "tokens_seen": 1112286208 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003348345035105316, + "loss": 2.7614, + "theoretical_loss": 3.612760962786616, + "tokens_seen": 1112351744 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003348244734202608, + "loss": 2.9782, + "theoretical_loss": 3.612741103833514, + "tokens_seen": 1112417280 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1783596, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7822399139404297, + "objective/train/theoretical_loss": 3.6127361393292285, + "objective/train/tokens_used": 1132893664, + "theoretical_loss": 3.6127361393292285, + "tokens_seen": 1112433664 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033481444332999, + "loss": 3.0066, + "theoretical_loss": 3.6127212463778964, + "tokens_seen": 1112482816 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033480441323971915, + "loss": 2.8764, + "theoretical_loss": 3.612701390419562, + "tokens_seen": 1112548352 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003347943831494484, + "loss": 2.8333, + "theoretical_loss": 3.612681535958308, + "tokens_seen": 1112613888 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003347843530591775, + "loss": 2.9252, + "theoretical_loss": 3.6126616829939353, + "tokens_seen": 1112679424 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033477432296890675, + "loss": 2.8582, + "theoretical_loss": 3.6126418315262425, + "tokens_seen": 1112744960 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033476429287863593, + "loss": 2.9392, + "theoretical_loss": 3.6126219815550282, + "tokens_seen": 1112810496 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003347542627883651, + "loss": 2.8844, + "theoretical_loss": 3.6126021330800913, + "tokens_seen": 1112876032 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003347442326980943, + "loss": 2.8796, + "theoretical_loss": 3.6125822861012313, + "tokens_seen": 1112941568 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033473420260782353, + "loss": 2.9457, + "theoretical_loss": 3.6125624406182473, + "tokens_seen": 1113007104 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033472417251755265, + "loss": 2.9077, + "theoretical_loss": 3.612542596630938, + "tokens_seen": 1113072640 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003347141424272819, + "loss": 2.9343, + "theoretical_loss": 3.6125227541391034, + "tokens_seen": 1113138176 + }, + { + "epoch": 3.03, + "learning_rate": 0.000334704112337011, + "loss": 2.909, + "theoretical_loss": 3.6125029131425426, + "tokens_seen": 1113203712 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033469408224674025, + "loss": 2.7722, + "theoretical_loss": 3.6124830736410543, + "tokens_seen": 1113269248 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033468405215646943, + "loss": 2.9052, + "theoretical_loss": 3.6124632356344386, + "tokens_seen": 1113334784 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003346740220661986, + "loss": 2.7929, + "theoretical_loss": 3.6124433991224945, + "tokens_seen": 1113400320 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003346639919759278, + "loss": 2.9714, + "theoretical_loss": 3.612423564105021, + "tokens_seen": 1113465856 + }, + { + "epoch": 3.03, + "learning_rate": 0.000334653961885657, + "loss": 2.9265, + "theoretical_loss": 3.6124037305818186, + "tokens_seen": 1113531392 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033464393179538616, + "loss": 2.7949, + "theoretical_loss": 3.612383898552686, + "tokens_seen": 1113596928 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003346339017051154, + "loss": 2.9174, + "theoretical_loss": 3.612364068017423, + "tokens_seen": 1113662464 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003346238716148445, + "loss": 2.9412, + "theoretical_loss": 3.612344238975829, + "tokens_seen": 1113728000 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033461384152457376, + "loss": 2.8436, + "theoretical_loss": 3.6123244114277036, + "tokens_seen": 1113793536 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003346038114343029, + "loss": 2.6807, + "theoretical_loss": 3.612304585372847, + "tokens_seen": 1113859072 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003345937813440321, + "loss": 2.9549, + "theoretical_loss": 3.612284760811058, + "tokens_seen": 1113924608 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003345837512537613, + "loss": 2.824, + "theoretical_loss": 3.612264937742137, + "tokens_seen": 1113990144 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003345737211634905, + "loss": 2.8973, + "theoretical_loss": 3.6122451161658837, + "tokens_seen": 1114055680 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1786457, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6653826236724854, + "objective/train/theoretical_loss": 3.612240161005026, + "objective/train/tokens_used": 1134532064, + "theoretical_loss": 3.612240161005026, + "tokens_seen": 1114072064 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033456369107321966, + "loss": 2.9092, + "theoretical_loss": 3.612225296082097, + "tokens_seen": 1114121216 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003345536609829489, + "loss": 2.9883, + "theoretical_loss": 3.6122054774905785, + "tokens_seen": 1114186752 + }, + { + "epoch": 3.03, + "learning_rate": 0.000334543630892678, + "loss": 2.9069, + "theoretical_loss": 3.6121856603911264, + "tokens_seen": 1114252288 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033453360080240726, + "loss": 2.8163, + "theoretical_loss": 3.612165844783542, + "tokens_seen": 1114317824 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003345235707121364, + "loss": 2.802, + "theoretical_loss": 3.6121460306676236, + "tokens_seen": 1114383360 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003345135406218656, + "loss": 2.9945, + "theoretical_loss": 3.612126218043173, + "tokens_seen": 1114448896 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003345035105315948, + "loss": 2.9277, + "theoretical_loss": 3.612106406909989, + "tokens_seen": 1114514432 + }, + { + "epoch": 3.03, + "learning_rate": 0.000334493480441324, + "loss": 2.8599, + "theoretical_loss": 3.612086597267872, + "tokens_seen": 1114579968 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033448345035105316, + "loss": 2.8169, + "theoretical_loss": 3.6120667891166223, + "tokens_seen": 1114645504 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033447342026078235, + "loss": 2.967, + "theoretical_loss": 3.6120469824560395, + "tokens_seen": 1114711040 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003344633901705115, + "loss": 2.8637, + "theoretical_loss": 3.612027177285925, + "tokens_seen": 1114776576 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033445336008024076, + "loss": 2.8396, + "theoretical_loss": 3.612007373606078, + "tokens_seen": 1114842112 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003344433299899699, + "loss": 2.9654, + "theoretical_loss": 3.6119875714162983, + "tokens_seen": 1114907648 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003344332998996991, + "loss": 2.8411, + "theoretical_loss": 3.6119677707163875, + "tokens_seen": 1114973184 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033442326980942825, + "loss": 2.8255, + "theoretical_loss": 3.6119479715061455, + "tokens_seen": 1115038720 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003344132397191575, + "loss": 2.7822, + "theoretical_loss": 3.6119281737853726, + "tokens_seen": 1115104256 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033440320962888667, + "loss": 2.7855, + "theoretical_loss": 3.611908377553869, + "tokens_seen": 1115169792 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033439317953861585, + "loss": 2.6664, + "theoretical_loss": 3.611888582811435, + "tokens_seen": 1115235328 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033438314944834503, + "loss": 2.9817, + "theoretical_loss": 3.6118687895578723, + "tokens_seen": 1115300864 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033437311935807427, + "loss": 2.9072, + "theoretical_loss": 3.61184899779298, + "tokens_seen": 1115366400 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003343630892678034, + "loss": 2.9079, + "theoretical_loss": 3.6118292075165592, + "tokens_seen": 1115431936 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033435305917753263, + "loss": 2.8259, + "theoretical_loss": 3.6118094187284107, + "tokens_seen": 1115497472 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033434302908726175, + "loss": 2.928, + "theoretical_loss": 3.6117896314283344, + "tokens_seen": 1115563008 + }, + { + "epoch": 3.03, + "learning_rate": 0.000334332998996991, + "loss": 2.8941, + "theoretical_loss": 3.6117698456161325, + "tokens_seen": 1115628544 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033432296890672017, + "loss": 2.8082, + "theoretical_loss": 3.6117500612916045, + "tokens_seen": 1115694080 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1789480, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7342512607574463, + "objective/train/theoretical_loss": 3.6117451154428983, + "objective/train/tokens_used": 1136170464, + "theoretical_loss": 3.6117451154428983, + "tokens_seen": 1115710464 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033431293881644935, + "loss": 2.945, + "theoretical_loss": 3.6117302784545515, + "tokens_seen": 1115759616 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033430290872617853, + "loss": 2.9041, + "theoretical_loss": 3.611710497104774, + "tokens_seen": 1115825152 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003342928786359077, + "loss": 2.8842, + "theoretical_loss": 3.6116907172420736, + "tokens_seen": 1115890688 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003342828485456369, + "loss": 2.7339, + "theoretical_loss": 3.6116709388662507, + "tokens_seen": 1115956224 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033427281845536613, + "loss": 2.7413, + "theoretical_loss": 3.6116511619771057, + "tokens_seen": 1116021760 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033426278836509526, + "loss": 2.9048, + "theoretical_loss": 3.6116313865744405, + "tokens_seen": 1116087296 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003342527582748245, + "loss": 2.9077, + "theoretical_loss": 3.6116116126580557, + "tokens_seen": 1116152832 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003342427281845536, + "loss": 2.9132, + "theoretical_loss": 3.611591840227752, + "tokens_seen": 1116218368 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033423269809428285, + "loss": 2.7715, + "theoretical_loss": 3.6115720692833313, + "tokens_seen": 1116283904 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033422266800401204, + "loss": 2.9294, + "theoretical_loss": 3.611552299824594, + "tokens_seen": 1116349440 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003342126379137412, + "loss": 2.9602, + "theoretical_loss": 3.611532531851341, + "tokens_seen": 1116414976 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003342026078234704, + "loss": 2.9651, + "theoretical_loss": 3.6115127653633747, + "tokens_seen": 1116480512 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033419257773319963, + "loss": 2.9657, + "theoretical_loss": 3.6114930003604955, + "tokens_seen": 1116546048 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033418254764292876, + "loss": 2.947, + "theoretical_loss": 3.611473236842505, + "tokens_seen": 1116611584 + }, + { + "epoch": 3.03, + "learning_rate": 0.000334172517552658, + "loss": 2.799, + "theoretical_loss": 3.6114534748092035, + "tokens_seen": 1116677120 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003341624874623871, + "loss": 2.8483, + "theoretical_loss": 3.611433714260393, + "tokens_seen": 1116742656 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033415245737211636, + "loss": 2.781, + "theoretical_loss": 3.611413955195876, + "tokens_seen": 1116808192 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033414242728184554, + "loss": 2.9614, + "theoretical_loss": 3.6113941976154518, + "tokens_seen": 1116873728 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003341323971915747, + "loss": 2.8284, + "theoretical_loss": 3.6113744415189237, + "tokens_seen": 1116939264 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003341223671013039, + "loss": 2.882, + "theoretical_loss": 3.6113546869060924, + "tokens_seen": 1117004800 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003341123370110331, + "loss": 2.8294, + "theoretical_loss": 3.6113349337767593, + "tokens_seen": 1117070336 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033410230692076226, + "loss": 2.8092, + "theoretical_loss": 3.611315182130726, + "tokens_seen": 1117135872 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003340922768304915, + "loss": 2.9001, + "theoretical_loss": 3.611295431967794, + "tokens_seen": 1117201408 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003340822467402207, + "loss": 2.8214, + "theoretical_loss": 3.611275683287766, + "tokens_seen": 1117266944 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033407221664994986, + "loss": 2.8916, + "theoretical_loss": 3.6112559360904424, + "tokens_seen": 1117332480 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1792364, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.629387140274048, + "objective/train/theoretical_loss": 3.6112509995227606, + "objective/train/tokens_used": 1137808864, + "theoretical_loss": 3.6112509995227606, + "tokens_seen": 1117348864 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003340621865596791, + "loss": 2.832, + "theoretical_loss": 3.6112361903756254, + "tokens_seen": 1117398016 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003340521564694082, + "loss": 2.7342, + "theoretical_loss": 3.611216446143117, + "tokens_seen": 1117463552 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033404212637913746, + "loss": 2.8177, + "theoretical_loss": 3.611196703392719, + "tokens_seen": 1117529088 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003340320962888666, + "loss": 2.9564, + "theoretical_loss": 3.6111769621242322, + "tokens_seen": 1117594624 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003340220661985958, + "loss": 2.8278, + "theoretical_loss": 3.61115722233746, + "tokens_seen": 1117660160 + }, + { + "epoch": 3.03, + "learning_rate": 0.000334012036108325, + "loss": 2.8787, + "theoretical_loss": 3.611137484032203, + "tokens_seen": 1117725696 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003340020060180542, + "loss": 2.9201, + "theoretical_loss": 3.6111177472082643, + "tokens_seen": 1117791232 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033399197592778336, + "loss": 2.8666, + "theoretical_loss": 3.611098011865445, + "tokens_seen": 1117856768 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033398194583751255, + "loss": 2.8353, + "theoretical_loss": 3.611078278003548, + "tokens_seen": 1117922304 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003339719157472417, + "loss": 2.8987, + "theoretical_loss": 3.6110585456223747, + "tokens_seen": 1117987840 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033396188565697096, + "loss": 2.7892, + "theoretical_loss": 3.6110388147217267, + "tokens_seen": 1118053376 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003339518555667001, + "loss": 2.8969, + "theoretical_loss": 3.6110190853014075, + "tokens_seen": 1118118912 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003339418254764293, + "loss": 2.9213, + "theoretical_loss": 3.610999357361218, + "tokens_seen": 1118184448 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033393179538615845, + "loss": 2.8435, + "theoretical_loss": 3.6109796309009616, + "tokens_seen": 1118249984 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003339217652958877, + "loss": 3.0166, + "theoretical_loss": 3.6109599059204394, + "tokens_seen": 1118315520 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033391173520561687, + "loss": 2.909, + "theoretical_loss": 3.610940182419455, + "tokens_seen": 1118381056 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033390170511534605, + "loss": 2.6954, + "theoretical_loss": 3.610920460397809, + "tokens_seen": 1118446592 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033389167502507523, + "loss": 2.9903, + "theoretical_loss": 3.610900739855305, + "tokens_seen": 1118512128 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033388164493480447, + "loss": 2.7519, + "theoretical_loss": 3.6108810207917457, + "tokens_seen": 1118577664 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003338716148445336, + "loss": 2.8505, + "theoretical_loss": 3.6108613032069328, + "tokens_seen": 1118643200 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033386158475426283, + "loss": 2.9513, + "theoretical_loss": 3.6108415871006687, + "tokens_seen": 1118708736 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033385155466399195, + "loss": 2.8042, + "theoretical_loss": 3.6108218724727568, + "tokens_seen": 1118774272 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003338415245737212, + "loss": 2.9453, + "theoretical_loss": 3.6108021593229984, + "tokens_seen": 1118839808 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033383149448345037, + "loss": 2.9104, + "theoretical_loss": 3.610782447651197, + "tokens_seen": 1118905344 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033382146439317955, + "loss": 2.8503, + "theoretical_loss": 3.610762737457155, + "tokens_seen": 1118970880 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1794356, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9079301357269287, + "objective/train/theoretical_loss": 3.6107578101395212, + "objective/train/tokens_used": 1139447264, + "theoretical_loss": 3.6107578101395212, + "tokens_seen": 1118987264 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033381143430290873, + "loss": 2.8593, + "theoretical_loss": 3.610743028740675, + "tokens_seen": 1119036416 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003338014042126379, + "loss": 2.8352, + "theoretical_loss": 3.6107233215015597, + "tokens_seen": 1119101952 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003337913741223671, + "loss": 2.7895, + "theoretical_loss": 3.610703615739612, + "tokens_seen": 1119167488 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033378134403209633, + "loss": 2.8582, + "theoretical_loss": 3.610683911454635, + "tokens_seen": 1119233024 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033377131394182546, + "loss": 2.9454, + "theoretical_loss": 3.6106642086464316, + "tokens_seen": 1119298560 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003337612838515547, + "loss": 2.8187, + "theoretical_loss": 3.6106445073148032, + "tokens_seen": 1119364096 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003337512537612838, + "loss": 2.8418, + "theoretical_loss": 3.610624807459554, + "tokens_seen": 1119429632 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033374122367101306, + "loss": 2.9549, + "theoretical_loss": 3.6106051090804865, + "tokens_seen": 1119495168 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033373119358074224, + "loss": 3.0414, + "theoretical_loss": 3.6105854121774046, + "tokens_seen": 1119560704 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003337211634904714, + "loss": 2.8803, + "theoretical_loss": 3.61056571675011, + "tokens_seen": 1119626240 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003337111334002006, + "loss": 2.9307, + "theoretical_loss": 3.610546022798406, + "tokens_seen": 1119691776 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033370110330992983, + "loss": 2.9512, + "theoretical_loss": 3.6105263303220965, + "tokens_seen": 1119757312 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033369107321965896, + "loss": 2.731, + "theoretical_loss": 3.610506639320984, + "tokens_seen": 1119822848 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003336810431293882, + "loss": 3.0321, + "theoretical_loss": 3.610486949794872, + "tokens_seen": 1119888384 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003336710130391173, + "loss": 2.7499, + "theoretical_loss": 3.6104672617435627, + "tokens_seen": 1119953920 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033366098294884656, + "loss": 2.9271, + "theoretical_loss": 3.610447575166861, + "tokens_seen": 1120019456 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033365095285857574, + "loss": 2.8536, + "theoretical_loss": 3.610427890064569, + "tokens_seen": 1120084992 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003336409227683049, + "loss": 2.9227, + "theoretical_loss": 3.6104082064364906, + "tokens_seen": 1120150528 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003336308926780341, + "loss": 3.0513, + "theoretical_loss": 3.6103885242824285, + "tokens_seen": 1120216064 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003336208625877633, + "loss": 2.7689, + "theoretical_loss": 3.6103688436021866, + "tokens_seen": 1120281600 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033361083249749246, + "loss": 2.913, + "theoretical_loss": 3.6103491643955685, + "tokens_seen": 1120347136 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003336008024072217, + "loss": 2.9015, + "theoretical_loss": 3.610329486662377, + "tokens_seen": 1120412672 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003335907723169508, + "loss": 3.0031, + "theoretical_loss": 3.6103098104024163, + "tokens_seen": 1120478208 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033358074222668006, + "loss": 2.8134, + "theoretical_loss": 3.6102901356154895, + "tokens_seen": 1120543744 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033357071213640924, + "loss": 2.7472, + "theoretical_loss": 3.6102704623014, + "tokens_seen": 1120609280 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1797089, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.890531063079834, + "objective/train/theoretical_loss": 3.610265544202986, + "objective/train/tokens_used": 1141085664, + "theoretical_loss": 3.610265544202986, + "tokens_seen": 1120625664 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003335606820461384, + "loss": 2.8098, + "theoretical_loss": 3.610250790459952, + "tokens_seen": 1120674816 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003335506519558676, + "loss": 2.7085, + "theoretical_loss": 3.610231120090949, + "tokens_seen": 1120740352 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003335406218655968, + "loss": 2.7278, + "theoretical_loss": 3.6102114511941945, + "tokens_seen": 1120805888 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033353059177532597, + "loss": 2.831, + "theoretical_loss": 3.6101917837694923, + "tokens_seen": 1120871424 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003335205616850552, + "loss": 2.8449, + "theoretical_loss": 3.610172117816646, + "tokens_seen": 1120936960 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033351053159478433, + "loss": 2.8019, + "theoretical_loss": 3.61015245333546, + "tokens_seen": 1121002496 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033350050150451356, + "loss": 2.7992, + "theoretical_loss": 3.6101327903257374, + "tokens_seen": 1121068032 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003334904714142427, + "loss": 2.9347, + "theoretical_loss": 3.6101131287872823, + "tokens_seen": 1121133568 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003334804413239719, + "loss": 2.752, + "theoretical_loss": 3.6100934687198993, + "tokens_seen": 1121199104 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003334704112337011, + "loss": 2.8141, + "theoretical_loss": 3.6100738101233913, + "tokens_seen": 1121264640 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003334603811434303, + "loss": 2.8665, + "theoretical_loss": 3.610054152997563, + "tokens_seen": 1121330176 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033345035105315947, + "loss": 2.8776, + "theoretical_loss": 3.6100344973422183, + "tokens_seen": 1121395712 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033344032096288865, + "loss": 2.7751, + "theoretical_loss": 3.610014843157161, + "tokens_seen": 1121461248 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033343029087261783, + "loss": 2.9898, + "theoretical_loss": 3.609995190442196, + "tokens_seen": 1121526784 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033342026078234707, + "loss": 2.8723, + "theoretical_loss": 3.609975539197126, + "tokens_seen": 1121592320 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003334102306920762, + "loss": 2.7833, + "theoretical_loss": 3.609955889421756, + "tokens_seen": 1121657856 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033340020060180543, + "loss": 2.9392, + "theoretical_loss": 3.609936241115891, + "tokens_seen": 1121723392 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003333901705115346, + "loss": 2.7724, + "theoretical_loss": 3.6099165942793343, + "tokens_seen": 1121788928 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003333801404212638, + "loss": 2.8442, + "theoretical_loss": 3.6098969489118904, + "tokens_seen": 1121854464 + }, + { + "epoch": 3.03, + "learning_rate": 0.000333370110330993, + "loss": 2.936, + "theoretical_loss": 3.6098773050133635, + "tokens_seen": 1121920000 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033336008024072215, + "loss": 2.7789, + "theoretical_loss": 3.6098576625835577, + "tokens_seen": 1121985536 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033335005015045134, + "loss": 2.9033, + "theoretical_loss": 3.6098380216222785, + "tokens_seen": 1122051072 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033334002006018057, + "loss": 2.7979, + "theoretical_loss": 3.6098183821293297, + "tokens_seen": 1122116608 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033332998996990975, + "loss": 2.786, + "theoretical_loss": 3.6097987441045154, + "tokens_seen": 1122182144 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033331995987963893, + "loss": 2.7927, + "theoretical_loss": 3.6097791075476406, + "tokens_seen": 1122247680 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1799759, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5882740020751953, + "objective/train/theoretical_loss": 3.6097741986377647, + "objective/train/tokens_used": 1142724064, + "theoretical_loss": 3.6097741986377647, + "tokens_seen": 1122264064 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003333099297893681, + "loss": 2.7901, + "theoretical_loss": 3.60975947245851, + "tokens_seen": 1122313216 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003332998996990973, + "loss": 2.9089, + "theoretical_loss": 3.6097398388369273, + "tokens_seen": 1122378752 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033328986960882653, + "loss": 2.9283, + "theoretical_loss": 3.609720206682698, + "tokens_seen": 1122444288 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033327983951855566, + "loss": 2.8006, + "theoretical_loss": 3.6097005759956264, + "tokens_seen": 1122509824 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003332698094282849, + "loss": 2.8824, + "theoretical_loss": 3.6096809467755175, + "tokens_seen": 1122575360 + }, + { + "epoch": 3.03, + "learning_rate": 0.000333259779338014, + "loss": 2.898, + "theoretical_loss": 3.6096613190221762, + "tokens_seen": 1122640896 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033324974924774326, + "loss": 2.9439, + "theoretical_loss": 3.6096416927354067, + "tokens_seen": 1122706432 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033323971915747244, + "loss": 2.8191, + "theoretical_loss": 3.609622067915014, + "tokens_seen": 1122771968 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003332296890672016, + "loss": 2.8414, + "theoretical_loss": 3.609602444560803, + "tokens_seen": 1122837504 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003332196589769308, + "loss": 2.8608, + "theoretical_loss": 3.609582822672579, + "tokens_seen": 1122903040 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033320962888666003, + "loss": 2.7959, + "theoretical_loss": 3.609563202250146, + "tokens_seen": 1122968576 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033319959879638916, + "loss": 2.9646, + "theoretical_loss": 3.60954358329331, + "tokens_seen": 1123034112 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003331895687061184, + "loss": 2.9569, + "theoretical_loss": 3.6095239658018756, + "tokens_seen": 1123099648 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003331795386158475, + "loss": 2.8244, + "theoretical_loss": 3.6095043497756474, + "tokens_seen": 1123165184 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033316950852557676, + "loss": 2.9049, + "theoretical_loss": 3.6094847352144313, + "tokens_seen": 1123230720 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033315947843530594, + "loss": 2.8275, + "theoretical_loss": 3.6094651221180314, + "tokens_seen": 1123296256 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003331494483450351, + "loss": 2.8528, + "theoretical_loss": 3.609445510486254, + "tokens_seen": 1123361792 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003331394182547643, + "loss": 2.6948, + "theoretical_loss": 3.6094259003189038, + "tokens_seen": 1123427328 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003331293881644935, + "loss": 2.8052, + "theoretical_loss": 3.6094062916157856, + "tokens_seen": 1123492864 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033311935807422266, + "loss": 3.0557, + "theoretical_loss": 3.609386684376705, + "tokens_seen": 1123558400 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003331093279839519, + "loss": 2.9465, + "theoretical_loss": 3.6093670786014673, + "tokens_seen": 1123623936 + }, + { + "epoch": 3.03, + "learning_rate": 0.000333099297893681, + "loss": 2.9013, + "theoretical_loss": 3.609347474289878, + "tokens_seen": 1123689472 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033308926780341026, + "loss": 2.8012, + "theoretical_loss": 3.6093278714417423, + "tokens_seen": 1123755008 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033307923771313944, + "loss": 2.9791, + "theoretical_loss": 3.609308270056866, + "tokens_seen": 1123820544 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003330692076228686, + "loss": 2.6917, + "theoretical_loss": 3.6092886701350535, + "tokens_seen": 1123886080 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1802618, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0263853073120117, + "objective/train/theoretical_loss": 3.6092837703831817, + "objective/train/tokens_used": 1144362464, + "theoretical_loss": 3.6092837703831817, + "tokens_seen": 1123902464 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003330591775325978, + "loss": 2.869, + "theoretical_loss": 3.6092690716761116, + "tokens_seen": 1123951616 + }, + { + "epoch": 3.03, + "learning_rate": 0.000333049147442327, + "loss": 2.928, + "theoretical_loss": 3.6092494746798454, + "tokens_seen": 1124017152 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033303911735205617, + "loss": 2.742, + "theoretical_loss": 3.60922987914606, + "tokens_seen": 1124082688 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003330290872617854, + "loss": 2.8673, + "theoretical_loss": 3.6092102850745613, + "tokens_seen": 1124148224 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033301905717151453, + "loss": 2.7522, + "theoretical_loss": 3.6091906924651553, + "tokens_seen": 1124213760 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033300902708124376, + "loss": 2.9123, + "theoretical_loss": 3.6091711013176466, + "tokens_seen": 1124279296 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003329989969909729, + "loss": 2.9104, + "theoretical_loss": 3.6091515116318424, + "tokens_seen": 1124344832 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003329889669007021, + "loss": 2.6848, + "theoretical_loss": 3.6091319234075474, + "tokens_seen": 1124410368 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003329789368104313, + "loss": 2.9216, + "theoretical_loss": 3.609112336644568, + "tokens_seen": 1124475904 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003329689067201605, + "loss": 2.9691, + "theoretical_loss": 3.6090927513427093, + "tokens_seen": 1124541440 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033295887662988967, + "loss": 2.8765, + "theoretical_loss": 3.6090731675017773, + "tokens_seen": 1124606976 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033294884653961885, + "loss": 2.8992, + "theoretical_loss": 3.609053585121579, + "tokens_seen": 1124672512 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033293881644934803, + "loss": 2.7087, + "theoretical_loss": 3.609034004201919, + "tokens_seen": 1124738048 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033292878635907727, + "loss": 2.7366, + "theoretical_loss": 3.609014424742604, + "tokens_seen": 1124803584 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003329187562688064, + "loss": 2.9137, + "theoretical_loss": 3.6089948467434394, + "tokens_seen": 1124869120 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033290872617853563, + "loss": 2.7744, + "theoretical_loss": 3.608975270204232, + "tokens_seen": 1124934656 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003328986960882648, + "loss": 2.9206, + "theoretical_loss": 3.6089556951247874, + "tokens_seen": 1125000192 + }, + { + "epoch": 3.03, + "learning_rate": 0.000332888665997994, + "loss": 2.9041, + "theoretical_loss": 3.608936121504912, + "tokens_seen": 1125065728 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003328786359077232, + "loss": 2.8937, + "theoretical_loss": 3.6089165493444115, + "tokens_seen": 1125131264 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033286860581745235, + "loss": 2.849, + "theoretical_loss": 3.6088969786430924, + "tokens_seen": 1125196800 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033285857572718154, + "loss": 2.831, + "theoretical_loss": 3.6088774094007614, + "tokens_seen": 1125262336 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033284854563691077, + "loss": 2.733, + "theoretical_loss": 3.6088578416172243, + "tokens_seen": 1125327872 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003328385155466399, + "loss": 2.7617, + "theoretical_loss": 3.608838275292287, + "tokens_seen": 1125393408 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033282848545636913, + "loss": 2.89, + "theoretical_loss": 3.6088187104257563, + "tokens_seen": 1125458944 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033281845536609826, + "loss": 2.9518, + "theoretical_loss": 3.6087991470174385, + "tokens_seen": 1125524480 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1803910, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.828958511352539, + "objective/train/theoretical_loss": 3.6087942563931823, + "objective/train/tokens_used": 1146000864, + "theoretical_loss": 3.6087942563931823, + "tokens_seen": 1125540864 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003328084252758275, + "loss": 3.0793, + "theoretical_loss": 3.6087795850671402, + "tokens_seen": 1125590016 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003327983951855567, + "loss": 2.9092, + "theoretical_loss": 3.608760024574668, + "tokens_seen": 1125655552 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033278836509528586, + "loss": 2.7611, + "theoretical_loss": 3.6087404655398276, + "tokens_seen": 1125721088 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033277833500501504, + "loss": 2.9015, + "theoretical_loss": 3.6087209079624265, + "tokens_seen": 1125786624 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003327683049147442, + "loss": 2.8441, + "theoretical_loss": 3.608701351842271, + "tokens_seen": 1125852160 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003327582748244734, + "loss": 2.9039, + "theoretical_loss": 3.6086817971791665, + "tokens_seen": 1125917696 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033274824473420264, + "loss": 2.8358, + "theoretical_loss": 3.6086622439729212, + "tokens_seen": 1125983232 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033273821464393176, + "loss": 2.7906, + "theoretical_loss": 3.6086426922233414, + "tokens_seen": 1126048768 + }, + { + "epoch": 3.03, + "learning_rate": 0.000332728184553661, + "loss": 2.9192, + "theoretical_loss": 3.6086231419302335, + "tokens_seen": 1126114304 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003327181544633902, + "loss": 2.8976, + "theoretical_loss": 3.608603593093404, + "tokens_seen": 1126179840 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033270812437311936, + "loss": 2.7811, + "theoretical_loss": 3.608584045712661, + "tokens_seen": 1126245376 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033269809428284854, + "loss": 2.9761, + "theoretical_loss": 3.6085644997878097, + "tokens_seen": 1126310912 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003326880641925777, + "loss": 2.9901, + "theoretical_loss": 3.608544955318658, + "tokens_seen": 1126376448 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003326780341023069, + "loss": 2.8957, + "theoretical_loss": 3.608525412305012, + "tokens_seen": 1126441984 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033266800401203614, + "loss": 2.8658, + "theoretical_loss": 3.6085058707466793, + "tokens_seen": 1126507520 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033265797392176527, + "loss": 2.7122, + "theoretical_loss": 3.608486330643467, + "tokens_seen": 1126573056 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003326479438314945, + "loss": 2.812, + "theoretical_loss": 3.6084667919951814, + "tokens_seen": 1126638592 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033263791374122363, + "loss": 2.984, + "theoretical_loss": 3.60844725480163, + "tokens_seen": 1126704128 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033262788365095286, + "loss": 2.8412, + "theoretical_loss": 3.60842771906262, + "tokens_seen": 1126769664 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033261785356068205, + "loss": 2.9218, + "theoretical_loss": 3.608408184777958, + "tokens_seen": 1126835200 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003326078234704112, + "loss": 2.9887, + "theoretical_loss": 3.6083886519474513, + "tokens_seen": 1126900736 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003325977933801404, + "loss": 2.8851, + "theoretical_loss": 3.6083691205709076, + "tokens_seen": 1126966272 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033258776328986964, + "loss": 2.8814, + "theoretical_loss": 3.608349590648134, + "tokens_seen": 1127031808 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003325777331995988, + "loss": 2.871, + "theoretical_loss": 3.608330062178937, + "tokens_seen": 1127097344 + }, + { + "epoch": 3.03, + "learning_rate": 0.000332567703109328, + "loss": 2.7574, + "theoretical_loss": 3.608310535163125, + "tokens_seen": 1127162880 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1806806, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.030730962753296, + "objective/train/theoretical_loss": 3.608305653636241, + "objective/train/tokens_used": 1147639264, + "theoretical_loss": 3.608305653636241, + "tokens_seen": 1127179264 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003325576730190572, + "loss": 3.1183, + "theoretical_loss": 3.6082910096005048, + "tokens_seen": 1127228416 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033254764292878637, + "loss": 2.8586, + "theoretical_loss": 3.6082714854908833, + "tokens_seen": 1127293952 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003325376128385156, + "loss": 2.8539, + "theoretical_loss": 3.608251962834069, + "tokens_seen": 1127359488 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033252758274824473, + "loss": 2.8002, + "theoretical_loss": 3.6082324416298683, + "tokens_seen": 1127425024 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033251755265797396, + "loss": 2.8257, + "theoretical_loss": 3.6082129218780894, + "tokens_seen": 1127490560 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003325075225677031, + "loss": 2.8686, + "theoretical_loss": 3.6081934035785395, + "tokens_seen": 1127556096 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033249749247743233, + "loss": 2.8168, + "theoretical_loss": 3.608173886731026, + "tokens_seen": 1127621632 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003324874623871615, + "loss": 3.003, + "theoretical_loss": 3.608154371335357, + "tokens_seen": 1127687168 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003324774322968907, + "loss": 2.811, + "theoretical_loss": 3.6081348573913394, + "tokens_seen": 1127752704 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033246740220661987, + "loss": 2.9033, + "theoretical_loss": 3.6081153448987813, + "tokens_seen": 1127818240 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033245737211634905, + "loss": 2.9073, + "theoretical_loss": 3.608095833857491, + "tokens_seen": 1127883776 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033244734202607823, + "loss": 2.8744, + "theoretical_loss": 3.6080763242672753, + "tokens_seen": 1127949312 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033243731193580747, + "loss": 2.7879, + "theoretical_loss": 3.6080568161279425, + "tokens_seen": 1128014848 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003324272818455366, + "loss": 2.7602, + "theoretical_loss": 3.6080373094393003, + "tokens_seen": 1128080384 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033241725175526583, + "loss": 2.8446, + "theoretical_loss": 3.6080178042011566, + "tokens_seen": 1128145920 + }, + { + "epoch": 3.03, + "learning_rate": 0.000332407221664995, + "loss": 2.8765, + "theoretical_loss": 3.6079983004133185, + "tokens_seen": 1128211456 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003323971915747242, + "loss": 2.8824, + "theoretical_loss": 3.607978798075595, + "tokens_seen": 1128276992 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003323871614844534, + "loss": 2.8164, + "theoretical_loss": 3.607959297187794, + "tokens_seen": 1128342528 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033237713139418255, + "loss": 3.0241, + "theoretical_loss": 3.607939797749723, + "tokens_seen": 1128408064 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033236710130391174, + "loss": 2.9629, + "theoretical_loss": 3.60792029976119, + "tokens_seen": 1128473600 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033235707121364097, + "loss": 2.9703, + "theoretical_loss": 3.6079008032220035, + "tokens_seen": 1128539136 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003323470411233701, + "loss": 2.9163, + "theoretical_loss": 3.6078813081319714, + "tokens_seen": 1128604672 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033233701103309933, + "loss": 2.7135, + "theoretical_loss": 3.607861814490902, + "tokens_seen": 1128670208 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033232698094282846, + "loss": 2.899, + "theoretical_loss": 3.6078423222986036, + "tokens_seen": 1128735744 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003323169508525577, + "loss": 2.769, + "theoretical_loss": 3.6078228315548833, + "tokens_seen": 1128801280 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1809677, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.831899404525757, + "objective/train/theoretical_loss": 3.6078179590952715, + "objective/train/tokens_used": 1149277664, + "theoretical_loss": 3.6078179590952715, + "tokens_seen": 1128817664 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003323069207622869, + "loss": 2.9351, + "theoretical_loss": 3.607803342259551, + "tokens_seen": 1128866816 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033229689067201606, + "loss": 2.8968, + "theoretical_loss": 3.607783854412414, + "tokens_seen": 1128932352 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033228686058174524, + "loss": 2.9732, + "theoretical_loss": 3.6077643680132807, + "tokens_seen": 1128997888 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003322768304914744, + "loss": 3.0027, + "theoretical_loss": 3.60774488306196, + "tokens_seen": 1129063424 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003322668004012036, + "loss": 2.8584, + "theoretical_loss": 3.6077253995582588, + "tokens_seen": 1129128960 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033225677031093284, + "loss": 2.89, + "theoretical_loss": 3.6077059175019874, + "tokens_seen": 1129194496 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033224674022066196, + "loss": 2.8563, + "theoretical_loss": 3.607686436892954, + "tokens_seen": 1129260032 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003322367101303912, + "loss": 2.6832, + "theoretical_loss": 3.6076669577309657, + "tokens_seen": 1129325568 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003322266800401204, + "loss": 2.8976, + "theoretical_loss": 3.6076474800158325, + "tokens_seen": 1129391104 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033221664994984956, + "loss": 2.8542, + "theoretical_loss": 3.6076280037473625, + "tokens_seen": 1129456640 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033220661985957874, + "loss": 2.866, + "theoretical_loss": 3.607608528925364, + "tokens_seen": 1129522176 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003321965897693079, + "loss": 3.0947, + "theoretical_loss": 3.6075890555496457, + "tokens_seen": 1129587712 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003321865596790371, + "loss": 2.8627, + "theoretical_loss": 3.607569583620017, + "tokens_seen": 1129653248 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033217652958876634, + "loss": 2.754, + "theoretical_loss": 3.607550113136286, + "tokens_seen": 1129718784 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033216649949849547, + "loss": 2.8542, + "theoretical_loss": 3.6075306440982615, + "tokens_seen": 1129784320 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003321564694082247, + "loss": 2.802, + "theoretical_loss": 3.607511176505753, + "tokens_seen": 1129849856 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033214643931795383, + "loss": 2.7982, + "theoretical_loss": 3.607491710358568, + "tokens_seen": 1129915392 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033213640922768306, + "loss": 2.7857, + "theoretical_loss": 3.607472245656516, + "tokens_seen": 1129980928 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033212637913741225, + "loss": 2.8629, + "theoretical_loss": 3.607452782399407, + "tokens_seen": 1130046464 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003321163490471414, + "loss": 2.9664, + "theoretical_loss": 3.607433320587048, + "tokens_seen": 1130112000 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003321063189568706, + "loss": 2.8581, + "theoretical_loss": 3.6074138602192494, + "tokens_seen": 1130177536 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033209628886659984, + "loss": 2.889, + "theoretical_loss": 3.60739440129582, + "tokens_seen": 1130243072 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033208625877632897, + "loss": 2.8316, + "theoretical_loss": 3.607374943816568, + "tokens_seen": 1130308608 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003320762286860582, + "loss": 2.9388, + "theoretical_loss": 3.607355487781304, + "tokens_seen": 1130374144 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033206619859578733, + "loss": 2.794, + "theoretical_loss": 3.607336033189836, + "tokens_seen": 1130439680 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1812529, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.907589912414551, + "objective/train/theoretical_loss": 3.6073311697675394, + "objective/train/tokens_used": 1150916064, + "theoretical_loss": 3.6073311697675394, + "tokens_seen": 1130456064 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033205616850551657, + "loss": 2.9175, + "theoretical_loss": 3.607316580041973, + "tokens_seen": 1130505216 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033204613841524575, + "loss": 2.8035, + "theoretical_loss": 3.607297128337525, + "tokens_seen": 1130570752 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033203610832497493, + "loss": 2.7513, + "theoretical_loss": 3.6072776780763007, + "tokens_seen": 1130636288 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003320260782347041, + "loss": 2.6758, + "theoretical_loss": 3.60725822925811, + "tokens_seen": 1130701824 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003320160481444333, + "loss": 2.9639, + "theoretical_loss": 3.607238781882761, + "tokens_seen": 1130767360 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003320060180541625, + "loss": 2.7504, + "theoretical_loss": 3.6072193359500644, + "tokens_seen": 1130832896 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003319959879638917, + "loss": 2.9167, + "theoretical_loss": 3.6071998914598287, + "tokens_seen": 1130898432 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033198595787362084, + "loss": 2.8091, + "theoretical_loss": 3.6071804484118637, + "tokens_seen": 1130963968 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033197592778335007, + "loss": 2.7107, + "theoretical_loss": 3.607161006805979, + "tokens_seen": 1131029504 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003319658976930792, + "loss": 2.828, + "theoretical_loss": 3.6071415666419835, + "tokens_seen": 1131095040 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033195586760280843, + "loss": 2.8371, + "theoretical_loss": 3.6071221279196872, + "tokens_seen": 1131160576 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003319458375125376, + "loss": 2.8264, + "theoretical_loss": 3.6071026906388997, + "tokens_seen": 1131226112 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003319358074222668, + "loss": 3.0165, + "theoretical_loss": 3.6070832547994303, + "tokens_seen": 1131291648 + }, + { + "epoch": 3.03, + "learning_rate": 0.000331925777331996, + "loss": 2.9839, + "theoretical_loss": 3.6070638204010885, + "tokens_seen": 1131357184 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003319157472417252, + "loss": 2.9569, + "theoretical_loss": 3.607044387443685, + "tokens_seen": 1131422720 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033190571715145434, + "loss": 2.758, + "theoretical_loss": 3.607024955927028, + "tokens_seen": 1131488256 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003318956870611836, + "loss": 2.9222, + "theoretical_loss": 3.6070055258509286, + "tokens_seen": 1131553792 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003318856569709127, + "loss": 2.8437, + "theoretical_loss": 3.606986097215196, + "tokens_seen": 1131619328 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033187562688064194, + "loss": 2.9078, + "theoretical_loss": 3.6069666700196397, + "tokens_seen": 1131684864 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003318655967903711, + "loss": 2.761, + "theoretical_loss": 3.6069472442640693, + "tokens_seen": 1131750400 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003318555667001003, + "loss": 2.8836, + "theoretical_loss": 3.606927819948296, + "tokens_seen": 1131815936 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003318455366098295, + "loss": 2.8645, + "theoretical_loss": 3.606908397072129, + "tokens_seen": 1131881472 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033183550651955866, + "loss": 2.849, + "theoretical_loss": 3.6068889756353784, + "tokens_seen": 1131947008 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003318254764292879, + "loss": 2.7917, + "theoretical_loss": 3.6068695556378536, + "tokens_seen": 1132012544 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003318154463390171, + "loss": 2.7203, + "theoretical_loss": 3.6068501370793653, + "tokens_seen": 1132078080 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1813896, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.703699827194214, + "objective/train/theoretical_loss": 3.60684528266457, + "objective/train/tokens_used": 1152554464, + "theoretical_loss": 3.60684528266457, + "tokens_seen": 1132094464 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033180541624874626, + "loss": 2.9375, + "theoretical_loss": 3.606830719959723, + "tokens_seen": 1132143616 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033179538615847544, + "loss": 2.8448, + "theoretical_loss": 3.6068113042787373, + "tokens_seen": 1132209152 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003317853560682046, + "loss": 3.0409, + "theoretical_loss": 3.6067918900362184, + "tokens_seen": 1132274688 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003317753259779338, + "loss": 2.7597, + "theoretical_loss": 3.606772477231976, + "tokens_seen": 1132340224 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033176529588766304, + "loss": 2.8492, + "theoretical_loss": 3.6067530658658207, + "tokens_seen": 1132405760 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033175526579739216, + "loss": 2.9724, + "theoretical_loss": 3.6067336559375627, + "tokens_seen": 1132471296 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003317452357071214, + "loss": 2.8226, + "theoretical_loss": 3.606714247447012, + "tokens_seen": 1132536832 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003317352056168506, + "loss": 2.9025, + "theoretical_loss": 3.6066948403939794, + "tokens_seen": 1132602368 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033172517552657976, + "loss": 2.9854, + "theoretical_loss": 3.606675434778275, + "tokens_seen": 1132667904 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033171514543630894, + "loss": 2.8469, + "theoretical_loss": 3.6066560305997086, + "tokens_seen": 1132733440 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003317051153460381, + "loss": 2.8222, + "theoretical_loss": 3.606636627858092, + "tokens_seen": 1132798976 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003316950852557673, + "loss": 2.814, + "theoretical_loss": 3.606617226553235, + "tokens_seen": 1132864512 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033168505516549654, + "loss": 2.9349, + "theoretical_loss": 3.606597826684947, + "tokens_seen": 1132930048 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033167502507522567, + "loss": 2.8914, + "theoretical_loss": 3.60657842825304, + "tokens_seen": 1132995584 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003316649949849549, + "loss": 2.894, + "theoretical_loss": 3.6065590312573246, + "tokens_seen": 1133061120 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033165496489468403, + "loss": 2.9824, + "theoretical_loss": 3.6065396356976103, + "tokens_seen": 1133126656 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033164493480441326, + "loss": 2.9678, + "theoretical_loss": 3.6065202415737083, + "tokens_seen": 1133192192 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033163490471414245, + "loss": 2.7228, + "theoretical_loss": 3.60650084888543, + "tokens_seen": 1133257728 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003316248746238716, + "loss": 2.9014, + "theoretical_loss": 3.6064814576325848, + "tokens_seen": 1133323264 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003316148445336008, + "loss": 2.7578, + "theoretical_loss": 3.6064620678149844, + "tokens_seen": 1133388800 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033160481444333004, + "loss": 2.9974, + "theoretical_loss": 3.6064426794324396, + "tokens_seen": 1133454336 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033159478435305917, + "loss": 2.8193, + "theoretical_loss": 3.6064232924847603, + "tokens_seen": 1133519872 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003315847542627884, + "loss": 3.0107, + "theoretical_loss": 3.6064039069717584, + "tokens_seen": 1133585408 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033157472417251753, + "loss": 2.7404, + "theoretical_loss": 3.606384522893244, + "tokens_seen": 1133650944 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033156469408224677, + "loss": 2.872, + "theoretical_loss": 3.6063651402490287, + "tokens_seen": 1133716480 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1816707, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.970165252685547, + "objective/train/theoretical_loss": 3.606360294812062, + "objective/train/tokens_used": 1154192864, + "theoretical_loss": 3.606360294812062, + "tokens_seen": 1133732864 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033155466399197595, + "loss": 3.0682, + "theoretical_loss": 3.6063457590389234, + "tokens_seen": 1133782016 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033154463390170513, + "loss": 2.7868, + "theoretical_loss": 3.6063263792627382, + "tokens_seen": 1133847552 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003315346038114343, + "loss": 2.8669, + "theoretical_loss": 3.6063070009202853, + "tokens_seen": 1133913088 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003315245737211635, + "loss": 2.8394, + "theoretical_loss": 3.606287624011375, + "tokens_seen": 1133978624 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003315145436308927, + "loss": 2.8455, + "theoretical_loss": 3.6062682485358195, + "tokens_seen": 1134044160 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003315045135406219, + "loss": 2.8596, + "theoretical_loss": 3.6062488744934287, + "tokens_seen": 1134109696 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033149448345035104, + "loss": 2.8522, + "theoretical_loss": 3.606229501884014, + "tokens_seen": 1134175232 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033148445336008027, + "loss": 2.7258, + "theoretical_loss": 3.606210130707387, + "tokens_seen": 1134240768 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003314744232698094, + "loss": 2.9181, + "theoretical_loss": 3.6061907609633588, + "tokens_seen": 1134306304 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033146439317953863, + "loss": 2.8346, + "theoretical_loss": 3.6061713926517407, + "tokens_seen": 1134371840 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003314543630892678, + "loss": 2.9933, + "theoretical_loss": 3.606152025772344, + "tokens_seen": 1134437376 + }, + { + "epoch": 3.03, + "learning_rate": 0.000331444332998997, + "loss": 2.789, + "theoretical_loss": 3.60613266032498, + "tokens_seen": 1134502912 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003314343029087262, + "loss": 2.8516, + "theoretical_loss": 3.6061132963094606, + "tokens_seen": 1134568448 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003314242728184554, + "loss": 2.8963, + "theoretical_loss": 3.606093933725597, + "tokens_seen": 1134633984 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033141424272818454, + "loss": 2.862, + "theoretical_loss": 3.6060745725732, + "tokens_seen": 1134699520 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003314042126379138, + "loss": 2.7605, + "theoretical_loss": 3.6060552128520813, + "tokens_seen": 1134765056 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003313941825476429, + "loss": 2.7751, + "theoretical_loss": 3.6060358545620534, + "tokens_seen": 1134830592 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033138415245737214, + "loss": 2.9188, + "theoretical_loss": 3.606016497702927, + "tokens_seen": 1134896128 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003313741223671013, + "loss": 2.8015, + "theoretical_loss": 3.605997142274514, + "tokens_seen": 1134961664 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003313640922768305, + "loss": 2.9361, + "theoretical_loss": 3.605977788276626, + "tokens_seen": 1135027200 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003313540621865597, + "loss": 2.8395, + "theoretical_loss": 3.6059584357090744, + "tokens_seen": 1135092736 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033134403209628886, + "loss": 2.8918, + "theoretical_loss": 3.6059390845716717, + "tokens_seen": 1135158272 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033133400200601804, + "loss": 2.974, + "theoretical_loss": 3.6059197348642282, + "tokens_seen": 1135223808 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003313239719157473, + "loss": 2.8058, + "theoretical_loss": 3.6059003865865575, + "tokens_seen": 1135289344 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003313139418254764, + "loss": 2.7979, + "theoretical_loss": 3.6058810397384704, + "tokens_seen": 1135354880 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1819465, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8530046939849854, + "objective/train/theoretical_loss": 3.605876203249799, + "objective/train/tokens_used": 1155831264, + "theoretical_loss": 3.605876203249799, + "tokens_seen": 1135371264 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033130391173520564, + "loss": 2.8237, + "theoretical_loss": 3.605861694319779, + "tokens_seen": 1135420416 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033129388164493477, + "loss": 3.0015, + "theoretical_loss": 3.6058423503302945, + "tokens_seen": 1135485952 + }, + { + "epoch": 3.03, + "learning_rate": 0.000331283851554664, + "loss": 2.8749, + "theoretical_loss": 3.60582300776983, + "tokens_seen": 1135551488 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003312738214643932, + "loss": 2.8074, + "theoretical_loss": 3.605803666638197, + "tokens_seen": 1135617024 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033126379137412236, + "loss": 2.7951, + "theoretical_loss": 3.6057843269352072, + "tokens_seen": 1135682560 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033125376128385155, + "loss": 2.9112, + "theoretical_loss": 3.605764988660673, + "tokens_seen": 1135748096 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003312437311935808, + "loss": 2.9728, + "theoretical_loss": 3.6057456518144066, + "tokens_seen": 1135813632 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003312337011033099, + "loss": 2.7427, + "theoretical_loss": 3.605726316396219, + "tokens_seen": 1135879168 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033122367101303914, + "loss": 2.7723, + "theoretical_loss": 3.605706982405924, + "tokens_seen": 1135944704 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033121364092276827, + "loss": 2.8708, + "theoretical_loss": 3.6056876498433326, + "tokens_seen": 1136010240 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003312036108324975, + "loss": 2.8693, + "theoretical_loss": 3.6056683187082577, + "tokens_seen": 1136075776 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003311935807422267, + "loss": 2.822, + "theoretical_loss": 3.605648989000511, + "tokens_seen": 1136141312 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033118355065195587, + "loss": 2.7495, + "theoretical_loss": 3.605629660719905, + "tokens_seen": 1136206848 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033117352056168505, + "loss": 2.6077, + "theoretical_loss": 3.605610333866252, + "tokens_seen": 1136272384 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033116349047141423, + "loss": 2.6485, + "theoretical_loss": 3.6055910084393643, + "tokens_seen": 1136337920 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003311534603811434, + "loss": 2.748, + "theoretical_loss": 3.605571684439055, + "tokens_seen": 1136403456 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033114343029087265, + "loss": 2.7643, + "theoretical_loss": 3.6055523618651355, + "tokens_seen": 1136468992 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033113340020060177, + "loss": 2.7264, + "theoretical_loss": 3.605533040717419, + "tokens_seen": 1136534528 + }, + { + "epoch": 3.03, + "learning_rate": 0.000331123370110331, + "loss": 2.743, + "theoretical_loss": 3.6055137209957175, + "tokens_seen": 1136600064 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033111334002006013, + "loss": 3.0171, + "theoretical_loss": 3.6054944026998434, + "tokens_seen": 1136665600 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033110330992978937, + "loss": 2.8908, + "theoretical_loss": 3.6054750858296103, + "tokens_seen": 1136731136 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033109327983951855, + "loss": 2.8347, + "theoretical_loss": 3.60545577038483, + "tokens_seen": 1136796672 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033108324974924773, + "loss": 2.6987, + "theoretical_loss": 3.6054364563653154, + "tokens_seen": 1136862208 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033107321965897697, + "loss": 2.9976, + "theoretical_loss": 3.6054171437708784, + "tokens_seen": 1136927744 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033106318956870615, + "loss": 2.8431, + "theoretical_loss": 3.6053978326013327, + "tokens_seen": 1136993280 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1821895, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7701449394226074, + "objective/train/theoretical_loss": 3.605393005031564, + "objective/train/tokens_used": 1157469664, + "theoretical_loss": 3.605393005031564, + "tokens_seen": 1137009664 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033105315947843533, + "loss": 2.81, + "theoretical_loss": 3.6053785228564914, + "tokens_seen": 1137058816 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003310431293881645, + "loss": 2.7846, + "theoretical_loss": 3.6053592145361657, + "tokens_seen": 1137124352 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003310330992978937, + "loss": 2.7774, + "theoretical_loss": 3.6053399076401695, + "tokens_seen": 1137189888 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003310230692076229, + "loss": 2.9367, + "theoretical_loss": 3.605320602168316, + "tokens_seen": 1137255424 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003310130391173521, + "loss": 2.877, + "theoretical_loss": 3.605301298120417, + "tokens_seen": 1137320960 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033100300902708124, + "loss": 2.8223, + "theoretical_loss": 3.605281995496286, + "tokens_seen": 1137386496 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033099297893681047, + "loss": 2.8726, + "theoretical_loss": 3.605262694295736, + "tokens_seen": 1137452032 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003309829488465396, + "loss": 2.7907, + "theoretical_loss": 3.6052433945185802, + "tokens_seen": 1137517568 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033097291875626883, + "loss": 2.846, + "theoretical_loss": 3.6052240961646316, + "tokens_seen": 1137583104 + }, + { + "epoch": 3.03, + "learning_rate": 0.000330962888665998, + "loss": 2.8453, + "theoretical_loss": 3.605204799233703, + "tokens_seen": 1137648640 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003309528585757272, + "loss": 2.8155, + "theoretical_loss": 3.605185503725607, + "tokens_seen": 1137714176 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003309428284854564, + "loss": 2.8554, + "theoretical_loss": 3.6051662096401573, + "tokens_seen": 1137779712 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003309327983951856, + "loss": 2.7552, + "theoretical_loss": 3.605146916977168, + "tokens_seen": 1137845248 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033092276830491474, + "loss": 2.8402, + "theoretical_loss": 3.6051276257364506, + "tokens_seen": 1137910784 + }, + { + "epoch": 3.03, + "learning_rate": 0.000330912738214644, + "loss": 2.7458, + "theoretical_loss": 3.6051083359178193, + "tokens_seen": 1137976320 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003309027081243731, + "loss": 2.9433, + "theoretical_loss": 3.6050890475210875, + "tokens_seen": 1138041856 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033089267803410234, + "loss": 2.8614, + "theoretical_loss": 3.605069760546068, + "tokens_seen": 1138107392 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003308826479438315, + "loss": 2.903, + "theoretical_loss": 3.6050504749925745, + "tokens_seen": 1138172928 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003308726178535607, + "loss": 2.8845, + "theoretical_loss": 3.6050311908604202, + "tokens_seen": 1138238464 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003308625877632899, + "loss": 2.865, + "theoretical_loss": 3.605011908149418, + "tokens_seen": 1138304000 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033085255767301906, + "loss": 2.7613, + "theoretical_loss": 3.6049926268593824, + "tokens_seen": 1138369536 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033084252758274824, + "loss": 2.9489, + "theoretical_loss": 3.6049733469901266, + "tokens_seen": 1138435072 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003308324974924775, + "loss": 2.7784, + "theoretical_loss": 3.604954068541464, + "tokens_seen": 1138500608 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003308224674022066, + "loss": 2.9328, + "theoretical_loss": 3.604934791513208, + "tokens_seen": 1138566144 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033081243731193584, + "loss": 2.7694, + "theoretical_loss": 3.604915515905172, + "tokens_seen": 1138631680 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1824687, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.847928285598755, + "objective/train/theoretical_loss": 3.6049106972250504, + "objective/train/tokens_used": 1159108064, + "theoretical_loss": 3.6049106972250504, + "tokens_seen": 1138648064 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033080240722166497, + "loss": 2.8513, + "theoretical_loss": 3.6048962417171704, + "tokens_seen": 1138697216 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003307923771313942, + "loss": 2.7355, + "theoretical_loss": 3.604876968949016, + "tokens_seen": 1138762752 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003307823470411234, + "loss": 2.7304, + "theoretical_loss": 3.604857697600523, + "tokens_seen": 1138828288 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033077231695085256, + "loss": 2.9483, + "theoretical_loss": 3.6048384276715053, + "tokens_seen": 1138893824 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033076228686058175, + "loss": 2.8566, + "theoretical_loss": 3.604819159161776, + "tokens_seen": 1138959360 + }, + { + "epoch": 3.03, + "learning_rate": 0.000330752256770311, + "loss": 2.9669, + "theoretical_loss": 3.60479989207115, + "tokens_seen": 1139024896 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003307422266800401, + "loss": 2.9638, + "theoretical_loss": 3.6047806263994397, + "tokens_seen": 1139090432 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033073219658976934, + "loss": 2.8907, + "theoretical_loss": 3.6047613621464607, + "tokens_seen": 1139155968 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033072216649949847, + "loss": 2.7946, + "theoretical_loss": 3.604742099312025, + "tokens_seen": 1139221504 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003307121364092277, + "loss": 2.858, + "theoretical_loss": 3.6047228378959484, + "tokens_seen": 1139287040 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003307021063189569, + "loss": 2.9972, + "theoretical_loss": 3.6047035778980434, + "tokens_seen": 1139352576 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033069207622868607, + "loss": 2.9062, + "theoretical_loss": 3.6046843193181246, + "tokens_seen": 1139418112 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033068204613841525, + "loss": 2.9272, + "theoretical_loss": 3.6046650621560063, + "tokens_seen": 1139483648 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033067201604814443, + "loss": 2.8283, + "theoretical_loss": 3.6046458064115026, + "tokens_seen": 1139549184 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003306619859578736, + "loss": 2.9087, + "theoretical_loss": 3.6046265520844267, + "tokens_seen": 1139614720 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033065195586760285, + "loss": 2.9068, + "theoretical_loss": 3.604607299174594, + "tokens_seen": 1139680256 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033064192577733197, + "loss": 2.8952, + "theoretical_loss": 3.604588047681818, + "tokens_seen": 1139745792 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003306318956870612, + "loss": 2.7917, + "theoretical_loss": 3.6045687976059133, + "tokens_seen": 1139811328 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033062186559679034, + "loss": 2.9507, + "theoretical_loss": 3.604549548946694, + "tokens_seen": 1139876864 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033061183550651957, + "loss": 2.8196, + "theoretical_loss": 3.6045303017039734, + "tokens_seen": 1139942400 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033060180541624875, + "loss": 2.8573, + "theoretical_loss": 3.6045110558775675, + "tokens_seen": 1140007936 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033059177532597793, + "loss": 2.9068, + "theoretical_loss": 3.60449181146729, + "tokens_seen": 1140073472 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003305817452357071, + "loss": 2.6809, + "theoretical_loss": 3.604472568472955, + "tokens_seen": 1140139008 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033057171514543635, + "loss": 2.8278, + "theoretical_loss": 3.604453326894377, + "tokens_seen": 1140204544 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003305616850551655, + "loss": 2.8698, + "theoretical_loss": 3.604434086731371, + "tokens_seen": 1140270080 + }, + { + "epoch": 3.03, + "objective/train/docs_used": 1827284, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9009814262390137, + "objective/train/theoretical_loss": 3.6044292769117807, + "objective/train/tokens_used": 1160746464, + "theoretical_loss": 3.6044292769117807, + "tokens_seen": 1140286464 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003305516549648947, + "loss": 2.953, + "theoretical_loss": 3.604414847983751, + "tokens_seen": 1140335616 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033054162487462384, + "loss": 2.7957, + "theoretical_loss": 3.6043956106513315, + "tokens_seen": 1140401152 + }, + { + "epoch": 3.03, + "learning_rate": 0.0003305315947843531, + "loss": 2.7181, + "theoretical_loss": 3.6043763747339277, + "tokens_seen": 1140466688 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033052156469408225, + "loss": 3.0333, + "theoretical_loss": 3.604357140231354, + "tokens_seen": 1140532224 + }, + { + "epoch": 3.03, + "learning_rate": 0.00033051153460381144, + "loss": 2.885, + "theoretical_loss": 3.604337907143424, + "tokens_seen": 1140597760 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003305015045135406, + "loss": 2.9376, + "theoretical_loss": 3.604318675469954, + "tokens_seen": 1140663296 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003304914744232698, + "loss": 2.7464, + "theoretical_loss": 3.6042994452107573, + "tokens_seen": 1140728832 + }, + { + "epoch": 3.04, + "learning_rate": 0.000330481444332999, + "loss": 2.8083, + "theoretical_loss": 3.6042802163656495, + "tokens_seen": 1140794368 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003304714142427282, + "loss": 2.7755, + "theoretical_loss": 3.6042609889344455, + "tokens_seen": 1140859904 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033046138415245734, + "loss": 2.9326, + "theoretical_loss": 3.60424176291696, + "tokens_seen": 1140925440 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003304513540621866, + "loss": 2.8851, + "theoretical_loss": 3.6042225383130067, + "tokens_seen": 1140990976 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003304413239719157, + "loss": 2.7727, + "theoretical_loss": 3.604203315122402, + "tokens_seen": 1141056512 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033043129388164494, + "loss": 2.864, + "theoretical_loss": 3.6041840933449603, + "tokens_seen": 1141122048 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003304212637913741, + "loss": 2.6859, + "theoretical_loss": 3.6041648729804967, + "tokens_seen": 1141187584 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003304112337011033, + "loss": 2.9548, + "theoretical_loss": 3.6041456540288257, + "tokens_seen": 1141253120 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003304012036108325, + "loss": 3.0195, + "theoretical_loss": 3.6041264364897634, + "tokens_seen": 1141318656 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003303911735205617, + "loss": 2.8915, + "theoretical_loss": 3.6041072203631233, + "tokens_seen": 1141384192 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033038114343029084, + "loss": 2.9517, + "theoretical_loss": 3.6040880056487223, + "tokens_seen": 1141449728 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003303711133400201, + "loss": 2.696, + "theoretical_loss": 3.6040687923463737, + "tokens_seen": 1141515264 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003303610832497492, + "loss": 3.0157, + "theoretical_loss": 3.604049580455894, + "tokens_seen": 1141580800 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033035105315947844, + "loss": 2.8795, + "theoretical_loss": 3.6040303699770977, + "tokens_seen": 1141646336 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003303410230692076, + "loss": 2.9179, + "theoretical_loss": 3.6040111609098004, + "tokens_seen": 1141711872 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003303309929789368, + "loss": 2.834, + "theoretical_loss": 3.6039919532538174, + "tokens_seen": 1141777408 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033032096288866604, + "loss": 2.9328, + "theoretical_loss": 3.6039727470089638, + "tokens_seen": 1141842944 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033031093279839517, + "loss": 3.0344, + "theoretical_loss": 3.6039535421750553, + "tokens_seen": 1141908480 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1830220, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0180392265319824, + "objective/train/theoretical_loss": 3.6039487411870166, + "objective/train/tokens_used": 1162384864, + "theoretical_loss": 3.6039487411870166, + "tokens_seen": 1141924864 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003303009027081244, + "loss": 2.8123, + "theoretical_loss": 3.6039343387519067, + "tokens_seen": 1141974016 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003302908726178536, + "loss": 2.9543, + "theoretical_loss": 3.603915136739334, + "tokens_seen": 1142039552 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033028084252758276, + "loss": 2.8753, + "theoretical_loss": 3.6038959361371523, + "tokens_seen": 1142105088 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033027081243731195, + "loss": 2.9125, + "theoretical_loss": 3.603876736945177, + "tokens_seen": 1142170624 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003302607823470412, + "loss": 2.9042, + "theoretical_loss": 3.603857539163224, + "tokens_seen": 1142236160 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003302507522567703, + "loss": 2.8722, + "theoretical_loss": 3.6038383427911085, + "tokens_seen": 1142301696 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033024072216649954, + "loss": 2.7368, + "theoretical_loss": 3.6038191478286463, + "tokens_seen": 1142367232 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033023069207622867, + "loss": 3.0493, + "theoretical_loss": 3.6037999542756527, + "tokens_seen": 1142432768 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003302206619859579, + "loss": 2.872, + "theoretical_loss": 3.6037807621319438, + "tokens_seen": 1142498304 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003302106318956871, + "loss": 2.9682, + "theoretical_loss": 3.603761571397335, + "tokens_seen": 1142563840 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033020060180541627, + "loss": 2.8885, + "theoretical_loss": 3.6037423820716428, + "tokens_seen": 1142629376 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033019057171514545, + "loss": 2.9105, + "theoretical_loss": 3.603723194154682, + "tokens_seen": 1142694912 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033018054162487463, + "loss": 2.9397, + "theoretical_loss": 3.603704007646268, + "tokens_seen": 1142760448 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003301705115346038, + "loss": 2.8143, + "theoretical_loss": 3.603684822546218, + "tokens_seen": 1142825984 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033016048144433305, + "loss": 2.8693, + "theoretical_loss": 3.6036656388543475, + "tokens_seen": 1142891520 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003301504513540622, + "loss": 2.93, + "theoretical_loss": 3.6036464565704716, + "tokens_seen": 1142957056 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003301404212637914, + "loss": 2.8788, + "theoretical_loss": 3.6036272756944063, + "tokens_seen": 1143022592 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033013039117352054, + "loss": 2.8394, + "theoretical_loss": 3.603608096225969, + "tokens_seen": 1143088128 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033012036108324977, + "loss": 2.6721, + "theoretical_loss": 3.603588918164974, + "tokens_seen": 1143153664 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033011033099297895, + "loss": 2.795, + "theoretical_loss": 3.603569741511238, + "tokens_seen": 1143219200 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033010030090270813, + "loss": 2.8225, + "theoretical_loss": 3.6035505662645777, + "tokens_seen": 1143284736 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003300902708124373, + "loss": 2.9129, + "theoretical_loss": 3.6035313924248085, + "tokens_seen": 1143350272 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033008024072216655, + "loss": 2.8068, + "theoretical_loss": 3.603512219991746, + "tokens_seen": 1143415808 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003300702106318957, + "loss": 2.8219, + "theoretical_loss": 3.6034930489652073, + "tokens_seen": 1143481344 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003300601805416249, + "loss": 2.9072, + "theoretical_loss": 3.6034738793450085, + "tokens_seen": 1143546880 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1831670, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.987501382827759, + "objective/train/theoretical_loss": 3.603469087159678, + "objective/train/tokens_used": 1164023264, + "theoretical_loss": 3.603469087159678, + "tokens_seen": 1143563264 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033005015045135404, + "loss": 2.852, + "theoretical_loss": 3.603454711130966, + "tokens_seen": 1143612416 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003300401203610833, + "loss": 3.0456, + "theoretical_loss": 3.6034355443228954, + "tokens_seen": 1143677952 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033003009027081245, + "loss": 2.8508, + "theoretical_loss": 3.603416378920614, + "tokens_seen": 1143743488 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033002006018054164, + "loss": 3.0091, + "theoretical_loss": 3.603397214923937, + "tokens_seen": 1143809024 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003300100300902708, + "loss": 2.9834, + "theoretical_loss": 3.6033780523326815, + "tokens_seen": 1143874560 + }, + { + "epoch": 3.04, + "learning_rate": 0.00033, + "loss": 2.8397, + "theoretical_loss": 3.6033588911466636, + "tokens_seen": 1143940096 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003299899699097292, + "loss": 2.7938, + "theoretical_loss": 3.6033397313657, + "tokens_seen": 1144005632 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003299799398194584, + "loss": 3.0002, + "theoretical_loss": 3.603320572989607, + "tokens_seen": 1144071168 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032996990972918754, + "loss": 2.755, + "theoretical_loss": 3.6033014160182013, + "tokens_seen": 1144136704 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003299598796389168, + "loss": 2.9398, + "theoretical_loss": 3.6032822604512997, + "tokens_seen": 1144202240 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003299498495486459, + "loss": 2.984, + "theoretical_loss": 3.6032631062887184, + "tokens_seen": 1144267776 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032993981945837514, + "loss": 2.8354, + "theoretical_loss": 3.603243953530274, + "tokens_seen": 1144333312 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003299297893681043, + "loss": 2.978, + "theoretical_loss": 3.6032248021757827, + "tokens_seen": 1144398848 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003299197592778335, + "loss": 2.7383, + "theoretical_loss": 3.6032056522250624, + "tokens_seen": 1144464384 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003299097291875627, + "loss": 2.8859, + "theoretical_loss": 3.603186503677929, + "tokens_seen": 1144529920 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003298996990972919, + "loss": 2.7111, + "theoretical_loss": 3.6031673565341995, + "tokens_seen": 1144595456 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032988966900702104, + "loss": 2.8018, + "theoretical_loss": 3.603148210793691, + "tokens_seen": 1144660992 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003298796389167503, + "loss": 2.8943, + "theoretical_loss": 3.603129066456219, + "tokens_seen": 1144726528 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003298696088264794, + "loss": 2.9249, + "theoretical_loss": 3.603109923521602, + "tokens_seen": 1144792064 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032985957873620864, + "loss": 2.8298, + "theoretical_loss": 3.603090781989656, + "tokens_seen": 1144857600 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003298495486459378, + "loss": 2.8737, + "theoretical_loss": 3.6030716418601987, + "tokens_seen": 1144923136 + }, + { + "epoch": 3.04, + "learning_rate": 0.000329839518555667, + "loss": 2.8429, + "theoretical_loss": 3.6030525031330463, + "tokens_seen": 1144988672 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003298294884653962, + "loss": 2.9551, + "theoretical_loss": 3.603033365808016, + "tokens_seen": 1145054208 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032981945837512537, + "loss": 2.9303, + "theoretical_loss": 3.603014229884925, + "tokens_seen": 1145119744 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032980942828485455, + "loss": 2.9392, + "theoretical_loss": 3.60299509536359, + "tokens_seen": 1145185280 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1835787, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4409260749816895, + "objective/train/theoretical_loss": 3.6029903119522593, + "objective/train/tokens_used": 1165661664, + "theoretical_loss": 3.6029903119522593, + "tokens_seen": 1145201664 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003297993981945838, + "loss": 2.6882, + "theoretical_loss": 3.6029759622438284, + "tokens_seen": 1145250816 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003297893681043129, + "loss": 2.8063, + "theoretical_loss": 3.6029568305254576, + "tokens_seen": 1145316352 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032977933801404215, + "loss": 2.9051, + "theoretical_loss": 3.6029377002082943, + "tokens_seen": 1145381888 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003297693079237713, + "loss": 2.8476, + "theoretical_loss": 3.6029185712921556, + "tokens_seen": 1145447424 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003297592778335005, + "loss": 2.9489, + "theoretical_loss": 3.6028994437768596, + "tokens_seen": 1145512960 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003297492477432297, + "loss": 2.7998, + "theoretical_loss": 3.602880317662223, + "tokens_seen": 1145578496 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032973921765295887, + "loss": 2.9603, + "theoretical_loss": 3.6028611929480627, + "tokens_seen": 1145644032 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032972918756268805, + "loss": 2.893, + "theoretical_loss": 3.602842069634197, + "tokens_seen": 1145709568 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003297191574724173, + "loss": 2.7799, + "theoretical_loss": 3.6028229477204423, + "tokens_seen": 1145775104 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003297091273821464, + "loss": 2.8448, + "theoretical_loss": 3.602803827206617, + "tokens_seen": 1145840640 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032969909729187565, + "loss": 2.7254, + "theoretical_loss": 3.6027847080925377, + "tokens_seen": 1145906176 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003296890672016048, + "loss": 2.8773, + "theoretical_loss": 3.6027655903780222, + "tokens_seen": 1145971712 + }, + { + "epoch": 3.04, + "learning_rate": 0.000329679037111334, + "loss": 2.8757, + "theoretical_loss": 3.6027464740628883, + "tokens_seen": 1146037248 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003296690070210632, + "loss": 2.8496, + "theoretical_loss": 3.602727359146953, + "tokens_seen": 1146102784 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003296589769307924, + "loss": 2.8909, + "theoretical_loss": 3.6027082456300343, + "tokens_seen": 1146168320 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032964894684052155, + "loss": 2.8582, + "theoretical_loss": 3.6026891335119497, + "tokens_seen": 1146233856 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032963891675025074, + "loss": 2.7211, + "theoretical_loss": 3.602670022792517, + "tokens_seen": 1146299392 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003296288866599799, + "loss": 2.9074, + "theoretical_loss": 3.602650913471554, + "tokens_seen": 1146364928 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032961885656970915, + "loss": 2.8434, + "theoretical_loss": 3.602631805548878, + "tokens_seen": 1146430464 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003296088264794383, + "loss": 2.9457, + "theoretical_loss": 3.6026126990243066, + "tokens_seen": 1146496000 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003295987963891675, + "loss": 2.8701, + "theoretical_loss": 3.602593593897658, + "tokens_seen": 1146561536 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003295887662988967, + "loss": 2.9762, + "theoretical_loss": 3.6025744901687498, + "tokens_seen": 1146627072 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003295787362086259, + "loss": 2.8216, + "theoretical_loss": 3.6025553878374, + "tokens_seen": 1146692608 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003295687061183551, + "loss": 2.8974, + "theoretical_loss": 3.602536286903427, + "tokens_seen": 1146758144 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032955867602808424, + "loss": 2.8563, + "theoretical_loss": 3.602517187366648, + "tokens_seen": 1146823680 + }, + { + "debugging/Self-BLEU-5": 0.5488437014905663, + "debugging/distinct-1-grams": 0.733128090282939, + "debugging/distinct-2-grams": 0.9408243528239506, + "debugging/entropy-1-grams": 6.162225805344617, + "debugging/entropy-2-grams": 7.208529640127749, + "debugging/length": 519.8, + "debugging/num_segments": 20, + "epoch": 3.04, + "objective/train/docs_used": 1837145, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.684386730194092, + "objective/train/theoretical_loss": 3.6025124127007437, + "objective/train/tokens_used": 1167300064, + "theoretical_loss": 3.6025124127007437, + "tokens_seen": 1146840064 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003295486459378135, + "loss": 2.8641, + "theoretical_loss": 3.6024980892268816, + "tokens_seen": 1146889216 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032953861584754265, + "loss": 2.8913, + "theoretical_loss": 3.602478992483945, + "tokens_seen": 1146954752 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032952858575727184, + "loss": 2.9096, + "theoretical_loss": 3.6024598971376562, + "tokens_seen": 1147020288 + }, + { + "epoch": 3.04, + "learning_rate": 0.000329518555667001, + "loss": 2.9093, + "theoretical_loss": 3.6024408031878346, + "tokens_seen": 1147085824 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003295085255767302, + "loss": 2.9536, + "theoretical_loss": 3.602421710634297, + "tokens_seen": 1147151360 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003294984954864594, + "loss": 2.7322, + "theoretical_loss": 3.602402619476862, + "tokens_seen": 1147216896 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003294884653961886, + "loss": 2.7622, + "theoretical_loss": 3.6023835297153473, + "tokens_seen": 1147282432 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032947843530591774, + "loss": 2.7833, + "theoretical_loss": 3.6023644413495717, + "tokens_seen": 1147347968 + }, + { + "epoch": 3.04, + "learning_rate": 0.000329468405215647, + "loss": 2.9626, + "theoretical_loss": 3.602345354379353, + "tokens_seen": 1147413504 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003294583751253761, + "loss": 2.9815, + "theoretical_loss": 3.602326268804511, + "tokens_seen": 1147479040 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032944834503510534, + "loss": 2.8031, + "theoretical_loss": 3.6023071846248613, + "tokens_seen": 1147544576 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003294383149448345, + "loss": 2.8151, + "theoretical_loss": 3.602288101840224, + "tokens_seen": 1147610112 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003294282848545637, + "loss": 2.8807, + "theoretical_loss": 3.602269020450417, + "tokens_seen": 1147675648 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003294182547642929, + "loss": 2.7876, + "theoretical_loss": 3.602249940455259, + "tokens_seen": 1147741184 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003294082246740221, + "loss": 2.6299, + "theoretical_loss": 3.6022308618545686, + "tokens_seen": 1147806720 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032939819458375124, + "loss": 2.8509, + "theoretical_loss": 3.602211784648164, + "tokens_seen": 1147872256 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003293881644934805, + "loss": 2.7676, + "theoretical_loss": 3.602192708835863, + "tokens_seen": 1147937792 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003293781344032096, + "loss": 2.8534, + "theoretical_loss": 3.6021736344174853, + "tokens_seen": 1148003328 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032936810431293884, + "loss": 2.9702, + "theoretical_loss": 3.6021545613928483, + "tokens_seen": 1148068864 + }, + { + "epoch": 3.04, + "learning_rate": 0.000329358074222668, + "loss": 3.0513, + "theoretical_loss": 3.602135489761772, + "tokens_seen": 1148134400 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003293480441323972, + "loss": 2.9786, + "theoretical_loss": 3.602116419524074, + "tokens_seen": 1148199936 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003293380140421264, + "loss": 2.975, + "theoretical_loss": 3.6020973506795735, + "tokens_seen": 1148265472 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032932798395185557, + "loss": 2.9873, + "theoretical_loss": 3.6020782832280887, + "tokens_seen": 1148331008 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032931795386158475, + "loss": 2.9096, + "theoretical_loss": 3.6020592171694386, + "tokens_seen": 1148396544 + }, + { + "epoch": 3.04, + "learning_rate": 0.000329307923771314, + "loss": 2.9625, + "theoretical_loss": 3.6020401525034424, + "tokens_seen": 1148462080 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1839880, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1416993141174316, + "objective/train/theoretical_loss": 3.6020353865545243, + "objective/train/tokens_used": 1168938464, + "theoretical_loss": 3.6020353865545243, + "tokens_seen": 1148478464 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003292978936810431, + "loss": 2.9242, + "theoretical_loss": 3.6020210892299183, + "tokens_seen": 1148527616 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032928786359077235, + "loss": 2.9168, + "theoretical_loss": 3.6020020273486857, + "tokens_seen": 1148593152 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003292778335005015, + "loss": 2.964, + "theoretical_loss": 3.6019829668595627, + "tokens_seen": 1148658688 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003292678034102307, + "loss": 2.9367, + "theoretical_loss": 3.6019639077623693, + "tokens_seen": 1148724224 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003292577733199599, + "loss": 2.7883, + "theoretical_loss": 3.6019448500569236, + "tokens_seen": 1148789760 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032924774322968907, + "loss": 2.9054, + "theoretical_loss": 3.6019257937430447, + "tokens_seen": 1148855296 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032923771313941825, + "loss": 2.8568, + "theoretical_loss": 3.601906738820552, + "tokens_seen": 1148920832 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003292276830491475, + "loss": 2.8713, + "theoretical_loss": 3.601887685289264, + "tokens_seen": 1148986368 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003292176529588766, + "loss": 2.8037, + "theoretical_loss": 3.601868633149, + "tokens_seen": 1149051904 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032920762286860585, + "loss": 2.9479, + "theoretical_loss": 3.6018495823995798, + "tokens_seen": 1149117440 + }, + { + "epoch": 3.04, + "learning_rate": 0.000329197592778335, + "loss": 3.0284, + "theoretical_loss": 3.6018305330408213, + "tokens_seen": 1149182976 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003291875626880642, + "loss": 2.8933, + "theoretical_loss": 3.6018114850725444, + "tokens_seen": 1149248512 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003291775325977934, + "loss": 2.7239, + "theoretical_loss": 3.6017924384945683, + "tokens_seen": 1149314048 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003291675025075226, + "loss": 2.7895, + "theoretical_loss": 3.6017733933067126, + "tokens_seen": 1149379584 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032915747241725175, + "loss": 3.1153, + "theoretical_loss": 3.601754349508796, + "tokens_seen": 1149445120 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032914744232698094, + "loss": 2.987, + "theoretical_loss": 3.601735307100638, + "tokens_seen": 1149510656 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003291374122367101, + "loss": 2.8643, + "theoretical_loss": 3.601716266082058, + "tokens_seen": 1149576192 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032912738214643935, + "loss": 2.7992, + "theoretical_loss": 3.6016972264528753, + "tokens_seen": 1149641728 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003291173520561685, + "loss": 2.8764, + "theoretical_loss": 3.6016781882129094, + "tokens_seen": 1149707264 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003291073219658977, + "loss": 2.9586, + "theoretical_loss": 3.6016591513619796, + "tokens_seen": 1149772800 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003290972918756269, + "loss": 2.8819, + "theoretical_loss": 3.601640115899906, + "tokens_seen": 1149838336 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003290872617853561, + "loss": 2.9246, + "theoretical_loss": 3.601621081826507, + "tokens_seen": 1149903872 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032907723169508526, + "loss": 2.8746, + "theoretical_loss": 3.6016020491416034, + "tokens_seen": 1149969408 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032906720160481444, + "loss": 2.8317, + "theoretical_loss": 3.601583017845014, + "tokens_seen": 1150034944 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003290571715145436, + "loss": 2.927, + "theoretical_loss": 3.6015639879365584, + "tokens_seen": 1150100480 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1842635, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.936035394668579, + "objective/train/theoretical_loss": 3.60155923067632, + "objective/train/tokens_used": 1170576864, + "theoretical_loss": 3.60155923067632, + "tokens_seen": 1150116864 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032904714142427286, + "loss": 2.7694, + "theoretical_loss": 3.601544959416057, + "tokens_seen": 1150166016 + }, + { + "epoch": 3.04, + "learning_rate": 0.000329037111334002, + "loss": 2.8581, + "theoretical_loss": 3.6015259322833284, + "tokens_seen": 1150231552 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003290270812437312, + "loss": 2.7352, + "theoretical_loss": 3.601506906538193, + "tokens_seen": 1150297088 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032901705115346034, + "loss": 2.6617, + "theoretical_loss": 3.601487882180471, + "tokens_seen": 1150362624 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003290070210631896, + "loss": 2.7591, + "theoretical_loss": 3.601468859209981, + "tokens_seen": 1150428160 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032899699097291876, + "loss": 3.0025, + "theoretical_loss": 3.601449837626544, + "tokens_seen": 1150493696 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032898696088264794, + "loss": 2.8913, + "theoretical_loss": 3.6014308174299794, + "tokens_seen": 1150559232 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003289769307923771, + "loss": 2.9071, + "theoretical_loss": 3.6014117986201066, + "tokens_seen": 1150624768 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003289669007021063, + "loss": 2.8988, + "theoretical_loss": 3.601392781196746, + "tokens_seen": 1150690304 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003289568706118355, + "loss": 2.9824, + "theoretical_loss": 3.601373765159718, + "tokens_seen": 1150755840 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003289468405215647, + "loss": 2.8294, + "theoretical_loss": 3.601354750508842, + "tokens_seen": 1150821376 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032893681043129385, + "loss": 2.9572, + "theoretical_loss": 3.601335737243938, + "tokens_seen": 1150886912 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003289267803410231, + "loss": 2.8447, + "theoretical_loss": 3.601316725364826, + "tokens_seen": 1150952448 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032891675025075226, + "loss": 2.7589, + "theoretical_loss": 3.6012977148713268, + "tokens_seen": 1151017984 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032890672016048145, + "loss": 2.8587, + "theoretical_loss": 3.60127870576326, + "tokens_seen": 1151083520 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003288966900702106, + "loss": 2.8622, + "theoretical_loss": 3.6012596980404457, + "tokens_seen": 1151149056 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003288866599799398, + "loss": 2.9251, + "theoretical_loss": 3.6012406917027038, + "tokens_seen": 1151214592 + }, + { + "epoch": 3.04, + "learning_rate": 0.000328876629889669, + "loss": 2.7109, + "theoretical_loss": 3.6012216867498554, + "tokens_seen": 1151280128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003288665997993982, + "loss": 3.0129, + "theoretical_loss": 3.6012026831817203, + "tokens_seen": 1151345664 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032885656970912735, + "loss": 2.9143, + "theoretical_loss": 3.601183680998119, + "tokens_seen": 1151411200 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003288465396188566, + "loss": 2.8585, + "theoretical_loss": 3.6011646801988713, + "tokens_seen": 1151476736 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003288365095285857, + "loss": 2.8612, + "theoretical_loss": 3.601145680783798, + "tokens_seen": 1151542272 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032882647943831495, + "loss": 2.7934, + "theoretical_loss": 3.601126682752719, + "tokens_seen": 1151607808 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003288164493480442, + "loss": 2.9443, + "theoretical_loss": 3.601107686105456, + "tokens_seen": 1151673344 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003288064192577733, + "loss": 2.9326, + "theoretical_loss": 3.601088690841828, + "tokens_seen": 1151738880 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1845503, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4826033115386963, + "objective/train/theoretical_loss": 3.6010839422420933, + "objective/train/tokens_used": 1172215264, + "theoretical_loss": 3.6010839422420933, + "tokens_seen": 1151755264 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032879638916750255, + "loss": 2.7083, + "theoretical_loss": 3.6010696969616562, + "tokens_seen": 1151804416 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003287863590772317, + "loss": 2.8849, + "theoretical_loss": 3.601050704464761, + "tokens_seen": 1151869952 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003287763289869609, + "loss": 2.9228, + "theoretical_loss": 3.601031713350963, + "tokens_seen": 1151935488 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003287662988966901, + "loss": 2.8695, + "theoretical_loss": 3.601012723620083, + "tokens_seen": 1152001024 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032875626880641927, + "loss": 2.87, + "theoretical_loss": 3.600993735271941, + "tokens_seen": 1152066560 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032874623871614845, + "loss": 2.9153, + "theoretical_loss": 3.600974748306359, + "tokens_seen": 1152132096 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003287362086258777, + "loss": 2.9159, + "theoretical_loss": 3.6009557627231557, + "tokens_seen": 1152197632 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003287261785356068, + "loss": 2.8813, + "theoretical_loss": 3.6009367785221533, + "tokens_seen": 1152263168 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032871614844533605, + "loss": 2.819, + "theoretical_loss": 3.6009177957031726, + "tokens_seen": 1152328704 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003287061183550652, + "loss": 2.8199, + "theoretical_loss": 3.6008988142660336, + "tokens_seen": 1152394240 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003286960882647944, + "loss": 2.8728, + "theoretical_loss": 3.6008798342105575, + "tokens_seen": 1152459776 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003286860581745236, + "loss": 2.9123, + "theoretical_loss": 3.6008608555365655, + "tokens_seen": 1152525312 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003286760280842528, + "loss": 2.9254, + "theoretical_loss": 3.600841878243878, + "tokens_seen": 1152590848 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032866599799398195, + "loss": 2.747, + "theoretical_loss": 3.6008229023323164, + "tokens_seen": 1152656384 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032865596790371114, + "loss": 2.8107, + "theoretical_loss": 3.600803927801701, + "tokens_seen": 1152721920 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003286459378134403, + "loss": 2.7212, + "theoretical_loss": 3.6007849546518536, + "tokens_seen": 1152787456 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032863590772316955, + "loss": 2.8636, + "theoretical_loss": 3.6007659828825944, + "tokens_seen": 1152852992 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003286258776328987, + "loss": 2.9106, + "theoretical_loss": 3.600747012493745, + "tokens_seen": 1152918528 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003286158475426279, + "loss": 2.8194, + "theoretical_loss": 3.6007280434851268, + "tokens_seen": 1152984064 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003286058174523571, + "loss": 2.9149, + "theoretical_loss": 3.6007090758565603, + "tokens_seen": 1153049600 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003285957873620863, + "loss": 2.8834, + "theoretical_loss": 3.600690109607867, + "tokens_seen": 1153115136 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032858575727181546, + "loss": 2.821, + "theoretical_loss": 3.6006711447388673, + "tokens_seen": 1153180672 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032857572718154464, + "loss": 2.9412, + "theoretical_loss": 3.6006521812493837, + "tokens_seen": 1153246208 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003285656970912738, + "loss": 2.8858, + "theoretical_loss": 3.600633219139237, + "tokens_seen": 1153311744 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032855566700100306, + "loss": 2.7785, + "theoretical_loss": 3.600614258408249, + "tokens_seen": 1153377280 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1847928, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8619863986968994, + "objective/train/theoretical_loss": 3.600609518440974, + "objective/train/tokens_used": 1173853664, + "theoretical_loss": 3.600609518440974, + "tokens_seen": 1153393664 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003285456369107322, + "loss": 2.8523, + "theoretical_loss": 3.6005952990562395, + "tokens_seen": 1153442816 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003285356068204614, + "loss": 2.7437, + "theoretical_loss": 3.6005763410830305, + "tokens_seen": 1153508352 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032852557673019054, + "loss": 2.7907, + "theoretical_loss": 3.600557384488445, + "tokens_seen": 1153573888 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003285155466399198, + "loss": 2.9167, + "theoretical_loss": 3.6005384292723024, + "tokens_seen": 1153639424 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032850551654964896, + "loss": 2.9433, + "theoretical_loss": 3.6005194754344245, + "tokens_seen": 1153704960 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032849548645937814, + "loss": 2.7943, + "theoretical_loss": 3.6005005229746336, + "tokens_seen": 1153770496 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003284854563691073, + "loss": 2.9173, + "theoretical_loss": 3.600481571892751, + "tokens_seen": 1153836032 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003284754262788365, + "loss": 2.7356, + "theoretical_loss": 3.600462622188598, + "tokens_seen": 1153901568 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003284653961885657, + "loss": 2.8972, + "theoretical_loss": 3.6004436738619963, + "tokens_seen": 1153967104 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003284553660982949, + "loss": 2.9157, + "theoretical_loss": 3.6004247269127676, + "tokens_seen": 1154032640 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032844533600802405, + "loss": 2.9331, + "theoretical_loss": 3.6004057813407337, + "tokens_seen": 1154098176 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003284353059177533, + "loss": 2.8568, + "theoretical_loss": 3.6003868371457157, + "tokens_seen": 1154163712 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032842527582748246, + "loss": 2.762, + "theoretical_loss": 3.600367894327536, + "tokens_seen": 1154229248 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032841524573721165, + "loss": 2.7667, + "theoretical_loss": 3.600348952886016, + "tokens_seen": 1154294784 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003284052156469408, + "loss": 2.9111, + "theoretical_loss": 3.6003300128209776, + "tokens_seen": 1154360320 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032839518555667, + "loss": 2.9451, + "theoretical_loss": 3.600311074132243, + "tokens_seen": 1154425856 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003283851554663992, + "loss": 2.8592, + "theoretical_loss": 3.6002921368196334, + "tokens_seen": 1154491392 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003283751253761284, + "loss": 2.7611, + "theoretical_loss": 3.6002732008829708, + "tokens_seen": 1154556928 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003283751253761284, + "loss": 2.8569, + "theoretical_loss": 3.600254266322078, + "tokens_seen": 1154622464 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032836509528585755, + "loss": 2.8705, + "theoretical_loss": 3.6002353331367756, + "tokens_seen": 1154688000 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003283550651955868, + "loss": 2.9179, + "theoretical_loss": 3.600216401326887, + "tokens_seen": 1154753536 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003283450351053159, + "loss": 2.8166, + "theoretical_loss": 3.6001974708922324, + "tokens_seen": 1154819072 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032833500501504515, + "loss": 2.9646, + "theoretical_loss": 3.600178541832636, + "tokens_seen": 1154884608 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032832497492477433, + "loss": 2.8255, + "theoretical_loss": 3.600159614147919, + "tokens_seen": 1154950144 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003283149448345035, + "loss": 2.9728, + "theoretical_loss": 3.6001406878379028, + "tokens_seen": 1155015680 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1850685, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8888022899627686, + "objective/train/theoretical_loss": 3.6001359564751754, + "objective/train/tokens_used": 1175492064, + "theoretical_loss": 3.6001359564751754, + "tokens_seen": 1155032064 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003283049147442327, + "loss": 2.8993, + "theoretical_loss": 3.6001217629024103, + "tokens_seen": 1155081216 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003282948846539619, + "loss": 2.7378, + "theoretical_loss": 3.6001028393412637, + "tokens_seen": 1155146752 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032828485456369105, + "loss": 2.7716, + "theoretical_loss": 3.6000839171542856, + "tokens_seen": 1155212288 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003282748244734203, + "loss": 2.8199, + "theoretical_loss": 3.600064996341297, + "tokens_seen": 1155277824 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003282647943831494, + "loss": 2.5756, + "theoretical_loss": 3.6000460769021214, + "tokens_seen": 1155343360 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032825476429287865, + "loss": 2.8831, + "theoretical_loss": 3.6000271588365806, + "tokens_seen": 1155408896 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032824473420260783, + "loss": 2.7283, + "theoretical_loss": 3.600008242144497, + "tokens_seen": 1155474432 + }, + { + "epoch": 3.04, + "learning_rate": 0.000328234704112337, + "loss": 2.907, + "theoretical_loss": 3.5999893268256935, + "tokens_seen": 1155539968 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003282246740220662, + "loss": 2.7429, + "theoretical_loss": 3.5999704128799914, + "tokens_seen": 1155605504 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003282146439317954, + "loss": 2.9049, + "theoretical_loss": 3.5999515003072142, + "tokens_seen": 1155671040 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032820461384152456, + "loss": 2.8873, + "theoretical_loss": 3.5999325891071843, + "tokens_seen": 1155736576 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003281945837512538, + "loss": 2.9153, + "theoretical_loss": 3.599913679279724, + "tokens_seen": 1155802112 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003281845536609829, + "loss": 2.6578, + "theoretical_loss": 3.5998947708246556, + "tokens_seen": 1155867648 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032817452357071215, + "loss": 2.8948, + "theoretical_loss": 3.599875863741802, + "tokens_seen": 1155933184 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003281644934804413, + "loss": 2.9088, + "theoretical_loss": 3.5998569580309856, + "tokens_seen": 1155998720 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003281544633901705, + "loss": 2.8689, + "theoretical_loss": 3.5998380536920296, + "tokens_seen": 1156064256 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003281444332998997, + "loss": 2.9137, + "theoretical_loss": 3.5998191507247563, + "tokens_seen": 1156129792 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003281344032096289, + "loss": 2.8747, + "theoretical_loss": 3.5998002491289878, + "tokens_seen": 1156195328 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032812437311935806, + "loss": 2.8807, + "theoretical_loss": 3.599781348904548, + "tokens_seen": 1156260864 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003281143430290873, + "loss": 2.9663, + "theoretical_loss": 3.599762450051259, + "tokens_seen": 1156326400 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003281043129388164, + "loss": 2.932, + "theoretical_loss": 3.5997435525689436, + "tokens_seen": 1156391936 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032809428284854566, + "loss": 2.9251, + "theoretical_loss": 3.5997246564574255, + "tokens_seen": 1156457472 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003280842527582748, + "loss": 3.0037, + "theoretical_loss": 3.5997057617165265, + "tokens_seen": 1156523008 + }, + { + "epoch": 3.04, + "learning_rate": 0.000328074222668004, + "loss": 2.8274, + "theoretical_loss": 3.5996868683460708, + "tokens_seen": 1156588544 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032806419257773326, + "loss": 2.9014, + "theoretical_loss": 3.5996679763458794, + "tokens_seen": 1156654080 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1853532, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.160818099975586, + "objective/train/theoretical_loss": 3.599663253559915, + "objective/train/tokens_used": 1177130464, + "theoretical_loss": 3.599663253559915, + "tokens_seen": 1156670464 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003280541624874624, + "loss": 2.9899, + "theoretical_loss": 3.599649085715777, + "tokens_seen": 1156719616 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003280441323971916, + "loss": 2.9794, + "theoretical_loss": 3.5996301964555864, + "tokens_seen": 1156785152 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032803410230692074, + "loss": 2.9816, + "theoretical_loss": 3.5996113085651293, + "tokens_seen": 1156850688 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032802407221665, + "loss": 2.6794, + "theoretical_loss": 3.5995924220442306, + "tokens_seen": 1156916224 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032801404212637916, + "loss": 2.8302, + "theoretical_loss": 3.5995735368927124, + "tokens_seen": 1156981760 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032800401203610834, + "loss": 3.037, + "theoretical_loss": 3.599554653110398, + "tokens_seen": 1157047296 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003279939819458375, + "loss": 2.7636, + "theoretical_loss": 3.599535770697111, + "tokens_seen": 1157112832 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003279839518555667, + "loss": 2.8607, + "theoretical_loss": 3.5995168896526737, + "tokens_seen": 1157178368 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003279739217652959, + "loss": 3.0569, + "theoretical_loss": 3.5994980099769105, + "tokens_seen": 1157243904 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003279638916750251, + "loss": 2.8467, + "theoretical_loss": 3.5994791316696437, + "tokens_seen": 1157309440 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032795386158475425, + "loss": 2.6778, + "theoretical_loss": 3.599460254730697, + "tokens_seen": 1157374976 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003279438314944835, + "loss": 2.8693, + "theoretical_loss": 3.5994413791598943, + "tokens_seen": 1157440512 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032793380140421266, + "loss": 2.7212, + "theoretical_loss": 3.599422504957058, + "tokens_seen": 1157506048 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032792377131394185, + "loss": 2.8075, + "theoretical_loss": 3.599403632122012, + "tokens_seen": 1157571584 + }, + { + "epoch": 3.04, + "learning_rate": 0.000327913741223671, + "loss": 3.0511, + "theoretical_loss": 3.59938476065458, + "tokens_seen": 1157637120 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003279037111334002, + "loss": 3.0071, + "theoretical_loss": 3.5993658905545844, + "tokens_seen": 1157702656 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003278936810431294, + "loss": 2.7803, + "theoretical_loss": 3.599347021821851, + "tokens_seen": 1157768192 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003278836509528586, + "loss": 2.9479, + "theoretical_loss": 3.5993281544562006, + "tokens_seen": 1157833728 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032787362086258775, + "loss": 2.6727, + "theoretical_loss": 3.5993092884574587, + "tokens_seen": 1157899264 + }, + { + "epoch": 3.04, + "learning_rate": 0.000327863590772317, + "loss": 2.8311, + "theoretical_loss": 3.5992904238254475, + "tokens_seen": 1157964800 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003278535606820461, + "loss": 2.8862, + "theoretical_loss": 3.599271560559992, + "tokens_seen": 1158030336 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032784353059177535, + "loss": 2.9273, + "theoretical_loss": 3.5992526986609157, + "tokens_seen": 1158095872 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032783350050150453, + "loss": 2.8612, + "theoretical_loss": 3.5992338381280415, + "tokens_seen": 1158161408 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003278234704112337, + "loss": 2.7581, + "theoretical_loss": 3.5992149789611934, + "tokens_seen": 1158226944 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003278134403209629, + "loss": 2.9052, + "theoretical_loss": 3.599196121160195, + "tokens_seen": 1158292480 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1856400, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8356711864471436, + "objective/train/theoretical_loss": 3.599191406923339, + "objective/train/tokens_used": 1178768864, + "theoretical_loss": 3.599191406923339, + "tokens_seen": 1158308864 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032780341023069213, + "loss": 2.9255, + "theoretical_loss": 3.5991772647248714, + "tokens_seen": 1158358016 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032779338014042125, + "loss": 2.9189, + "theoretical_loss": 3.5991584096550446, + "tokens_seen": 1158423552 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003277833500501505, + "loss": 2.7531, + "theoretical_loss": 3.5991395559505395, + "tokens_seen": 1158489088 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003277733199598796, + "loss": 2.9067, + "theoretical_loss": 3.59912070361118, + "tokens_seen": 1158554624 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032776328986960885, + "loss": 2.8012, + "theoretical_loss": 3.5991018526367897, + "tokens_seen": 1158620160 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032775325977933803, + "loss": 2.8997, + "theoretical_loss": 3.599083003027193, + "tokens_seen": 1158685696 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003277432296890672, + "loss": 2.8206, + "theoretical_loss": 3.599064154782214, + "tokens_seen": 1158751232 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003277331995987964, + "loss": 2.8115, + "theoretical_loss": 3.5990453079016755, + "tokens_seen": 1158816768 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003277231695085256, + "loss": 2.8795, + "theoretical_loss": 3.599026462385403, + "tokens_seen": 1158882304 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032771313941825476, + "loss": 2.9624, + "theoretical_loss": 3.5990076182332205, + "tokens_seen": 1158947840 + }, + { + "epoch": 3.04, + "learning_rate": 0.000327703109327984, + "loss": 2.9255, + "theoretical_loss": 3.598988775444951, + "tokens_seen": 1159013376 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003276930792377131, + "loss": 2.7319, + "theoretical_loss": 3.5989699340204195, + "tokens_seen": 1159078912 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032768304914744235, + "loss": 2.8683, + "theoretical_loss": 3.5989510939594505, + "tokens_seen": 1159144448 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003276730190571715, + "loss": 2.7682, + "theoretical_loss": 3.598932255261867, + "tokens_seen": 1159209984 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003276629889669007, + "loss": 2.9808, + "theoretical_loss": 3.598913417927495, + "tokens_seen": 1159275520 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003276529588766299, + "loss": 2.9183, + "theoretical_loss": 3.5988945819561575, + "tokens_seen": 1159341056 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003276429287863591, + "loss": 2.9355, + "theoretical_loss": 3.5988757473476793, + "tokens_seen": 1159406592 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032763289869608826, + "loss": 2.8656, + "theoretical_loss": 3.5988569141018845, + "tokens_seen": 1159472128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003276228686058175, + "loss": 2.8447, + "theoretical_loss": 3.598838082218598, + "tokens_seen": 1159537664 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003276128385155466, + "loss": 2.8904, + "theoretical_loss": 3.5988192516976434, + "tokens_seen": 1159603200 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032760280842527586, + "loss": 2.9111, + "theoretical_loss": 3.598800422538846, + "tokens_seen": 1159668736 + }, + { + "epoch": 3.04, + "learning_rate": 0.000327592778335005, + "loss": 2.9156, + "theoretical_loss": 3.5987815947420296, + "tokens_seen": 1159734272 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003275827482447342, + "loss": 2.7985, + "theoretical_loss": 3.5987627683070196, + "tokens_seen": 1159799808 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003275727181544634, + "loss": 2.8386, + "theoretical_loss": 3.5987439432336394, + "tokens_seen": 1159865344 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003275626880641926, + "loss": 2.8405, + "theoretical_loss": 3.598725119521715, + "tokens_seen": 1159930880 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1857916, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.819260835647583, + "objective/train/theoretical_loss": 3.598720413806441, + "objective/train/tokens_used": 1180407264, + "theoretical_loss": 3.598720413806441, + "tokens_seen": 1159947264 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032755265797392176, + "loss": 2.7786, + "theoretical_loss": 3.59870629717107, + "tokens_seen": 1159996416 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032754262788365094, + "loss": 2.8751, + "theoretical_loss": 3.598687476181529, + "tokens_seen": 1160061952 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003275325977933801, + "loss": 3.1131, + "theoretical_loss": 3.5986686565529173, + "tokens_seen": 1160127488 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032752256770310936, + "loss": 2.863, + "theoretical_loss": 3.598649838285059, + "tokens_seen": 1160193024 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003275125376128385, + "loss": 2.9063, + "theoretical_loss": 3.5986310213777797, + "tokens_seen": 1160258560 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003275025075225677, + "loss": 2.9027, + "theoretical_loss": 3.598612205830903, + "tokens_seen": 1160324096 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032749247743229685, + "loss": 2.9109, + "theoretical_loss": 3.598593391644255, + "tokens_seen": 1160389632 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003274824473420261, + "loss": 2.8418, + "theoretical_loss": 3.5985745788176597, + "tokens_seen": 1160455168 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032747241725175527, + "loss": 2.7765, + "theoretical_loss": 3.598555767350942, + "tokens_seen": 1160520704 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032746238716148445, + "loss": 2.7975, + "theoretical_loss": 3.5985369572439274, + "tokens_seen": 1160586240 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032745235707121363, + "loss": 2.683, + "theoretical_loss": 3.5985181484964404, + "tokens_seen": 1160651776 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032744232698094286, + "loss": 2.898, + "theoretical_loss": 3.5984993411083055, + "tokens_seen": 1160717312 + }, + { + "epoch": 3.04, + "learning_rate": 0.000327432296890672, + "loss": 2.6954, + "theoretical_loss": 3.598480535079349, + "tokens_seen": 1160782848 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003274222668004012, + "loss": 2.8229, + "theoretical_loss": 3.598461730409395, + "tokens_seen": 1160848384 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032741223671013035, + "loss": 2.949, + "theoretical_loss": 3.5984429270982687, + "tokens_seen": 1160913920 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003274022066198596, + "loss": 2.9408, + "theoretical_loss": 3.5984241251457956, + "tokens_seen": 1160979456 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032739217652958877, + "loss": 2.9185, + "theoretical_loss": 3.5984053245518, + "tokens_seen": 1161044992 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032738214643931795, + "loss": 2.801, + "theoretical_loss": 3.5983865253161085, + "tokens_seen": 1161110528 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032737211634904713, + "loss": 2.862, + "theoretical_loss": 3.5983677274385446, + "tokens_seen": 1161176064 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003273620862587763, + "loss": 2.9319, + "theoretical_loss": 3.5983489309189345, + "tokens_seen": 1161241600 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003273520561685055, + "loss": 2.8548, + "theoretical_loss": 3.598330135757103, + "tokens_seen": 1161307136 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032734202607823473, + "loss": 2.9083, + "theoretical_loss": 3.5983113419528765, + "tokens_seen": 1161372672 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032733199598796386, + "loss": 2.9181, + "theoretical_loss": 3.5982925495060796, + "tokens_seen": 1161438208 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003273219658976931, + "loss": 2.9713, + "theoretical_loss": 3.5982737584165374, + "tokens_seen": 1161503744 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032731193580742233, + "loss": 2.9378, + "theoretical_loss": 3.598254968684076, + "tokens_seen": 1161569280 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1860918, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8182618618011475, + "objective/train/theoretical_loss": 3.598250271462984, + "objective/train/tokens_used": 1182045664, + "theoretical_loss": 3.598250271462984, + "tokens_seen": 1161585664 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032730190571715145, + "loss": 2.8377, + "theoretical_loss": 3.5982361803085197, + "tokens_seen": 1161634816 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003272918756268807, + "loss": 2.95, + "theoretical_loss": 3.598217393289695, + "tokens_seen": 1161700352 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003272818455366098, + "loss": 2.8503, + "theoretical_loss": 3.598198607627427, + "tokens_seen": 1161765888 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032727181544633905, + "loss": 2.8144, + "theoretical_loss": 3.598179823321541, + "tokens_seen": 1161831424 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032726178535606823, + "loss": 2.8545, + "theoretical_loss": 3.5981610403718634, + "tokens_seen": 1161896960 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003272517552657974, + "loss": 2.7787, + "theoretical_loss": 3.5981422587782186, + "tokens_seen": 1161962496 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003272417251755266, + "loss": 2.9619, + "theoretical_loss": 3.5981234785404337, + "tokens_seen": 1162028032 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003272316950852558, + "loss": 2.8021, + "theoretical_loss": 3.5981046996583332, + "tokens_seen": 1162093568 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032722166499498496, + "loss": 2.9426, + "theoretical_loss": 3.5980859221317427, + "tokens_seen": 1162159104 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003272116349047142, + "loss": 2.9816, + "theoretical_loss": 3.598067145960489, + "tokens_seen": 1162224640 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003272016048144433, + "loss": 2.7411, + "theoretical_loss": 3.5980483711443965, + "tokens_seen": 1162290176 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032719157472417256, + "loss": 2.9059, + "theoretical_loss": 3.5980295976832926, + "tokens_seen": 1162355712 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003271815446339017, + "loss": 3.0301, + "theoretical_loss": 3.5980108255770014, + "tokens_seen": 1162421248 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003271715145436309, + "loss": 2.8636, + "theoretical_loss": 3.59799205482535, + "tokens_seen": 1162486784 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003271614844533601, + "loss": 2.8251, + "theoretical_loss": 3.597973285428164, + "tokens_seen": 1162552320 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003271514543630893, + "loss": 2.9234, + "theoretical_loss": 3.5979545173852685, + "tokens_seen": 1162617856 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032714142427281846, + "loss": 2.8731, + "theoretical_loss": 3.597935750696491, + "tokens_seen": 1162683392 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003271313941825477, + "loss": 2.8747, + "theoretical_loss": 3.597916985361656, + "tokens_seen": 1162748928 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003271213640922768, + "loss": 2.8847, + "theoretical_loss": 3.59789822138059, + "tokens_seen": 1162814464 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032711133400200606, + "loss": 2.8183, + "theoretical_loss": 3.5978794587531198, + "tokens_seen": 1162880000 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003271013039117352, + "loss": 2.7568, + "theoretical_loss": 3.597860697479071, + "tokens_seen": 1162945536 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003270912738214644, + "loss": 2.9016, + "theoretical_loss": 3.5978419375582686, + "tokens_seen": 1163011072 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003270812437311936, + "loss": 2.8018, + "theoretical_loss": 3.59782317899054, + "tokens_seen": 1163076608 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003270712136409228, + "loss": 2.8759, + "theoretical_loss": 3.5978044217757112, + "tokens_seen": 1163142144 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032706118355065196, + "loss": 2.8228, + "theoretical_loss": 3.5977856659136087, + "tokens_seen": 1163207680 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1864384, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.058995246887207, + "objective/train/theoretical_loss": 3.597780977159426, + "objective/train/tokens_used": 1183684064, + "theoretical_loss": 3.597780977159426, + "tokens_seen": 1163224064 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032705115346038114, + "loss": 2.8366, + "theoretical_loss": 3.597766911404058, + "tokens_seen": 1163273216 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003270411233701103, + "loss": 2.8351, + "theoretical_loss": 3.597748158246886, + "tokens_seen": 1163338752 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032703109327983956, + "loss": 2.8577, + "theoretical_loss": 3.5977294064419176, + "tokens_seen": 1163404288 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003270210631895687, + "loss": 2.9466, + "theoretical_loss": 3.5977106559889815, + "tokens_seen": 1163469824 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003270110330992979, + "loss": 2.8387, + "theoretical_loss": 3.5976919068879023, + "tokens_seen": 1163535360 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032700100300902705, + "loss": 2.918, + "theoretical_loss": 3.597673159138507, + "tokens_seen": 1163600896 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003269909729187563, + "loss": 2.8449, + "theoretical_loss": 3.5976544127406216, + "tokens_seen": 1163666432 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032698094282848547, + "loss": 2.8843, + "theoretical_loss": 3.5976356676940733, + "tokens_seen": 1163731968 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032697091273821465, + "loss": 2.699, + "theoretical_loss": 3.5976169239986877, + "tokens_seen": 1163797504 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032696088264794383, + "loss": 2.9776, + "theoretical_loss": 3.597598181654292, + "tokens_seen": 1163863040 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032695085255767306, + "loss": 2.9837, + "theoretical_loss": 3.5975794406607133, + "tokens_seen": 1163928576 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003269408224674022, + "loss": 2.8828, + "theoretical_loss": 3.597560701017777, + "tokens_seen": 1163994112 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003269307923771314, + "loss": 2.9618, + "theoretical_loss": 3.59754196272531, + "tokens_seen": 1164059648 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032692076228686055, + "loss": 2.8505, + "theoretical_loss": 3.5975232257831395, + "tokens_seen": 1164125184 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003269107321965898, + "loss": 2.852, + "theoretical_loss": 3.5975044901910915, + "tokens_seen": 1164190720 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032690070210631897, + "loss": 2.9178, + "theoretical_loss": 3.597485755948993, + "tokens_seen": 1164256256 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032689067201604815, + "loss": 2.9123, + "theoretical_loss": 3.597467023056671, + "tokens_seen": 1164321792 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032688064192577733, + "loss": 2.8015, + "theoretical_loss": 3.597448291513952, + "tokens_seen": 1164387328 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003268706118355065, + "loss": 2.954, + "theoretical_loss": 3.5974295613206633, + "tokens_seen": 1164452864 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003268605817452357, + "loss": 2.8235, + "theoretical_loss": 3.5974108324766307, + "tokens_seen": 1164518400 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032685055165496493, + "loss": 2.8273, + "theoretical_loss": 3.597392104981682, + "tokens_seen": 1164583936 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032684052156469406, + "loss": 2.9041, + "theoretical_loss": 3.5973733788356443, + "tokens_seen": 1164649472 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003268304914744233, + "loss": 2.882, + "theoretical_loss": 3.5973546540383436, + "tokens_seen": 1164715008 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003268204613841524, + "loss": 2.8342, + "theoretical_loss": 3.5973359305896078, + "tokens_seen": 1164780544 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032681043129388165, + "loss": 2.9397, + "theoretical_loss": 3.5973172084892635, + "tokens_seen": 1164846080 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1865774, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0422041416168213, + "objective/train/theoretical_loss": 3.597312528174843, + "objective/train/tokens_used": 1185322464, + "theoretical_loss": 3.597312528174843, + "tokens_seen": 1164862464 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032680040120361084, + "loss": 2.9042, + "theoretical_loss": 3.5972984877371372, + "tokens_seen": 1164911616 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032679037111334, + "loss": 2.7628, + "theoretical_loss": 3.597279768333057, + "tokens_seen": 1164977152 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003267803410230692, + "loss": 2.9098, + "theoretical_loss": 3.597261050276849, + "tokens_seen": 1165042688 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032677031093279843, + "loss": 2.9331, + "theoretical_loss": 3.597242333568341, + "tokens_seen": 1165108224 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032676028084252756, + "loss": 3.0056, + "theoretical_loss": 3.5972236182073605, + "tokens_seen": 1165173760 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003267502507522568, + "loss": 2.7495, + "theoretical_loss": 3.5972049041937337, + "tokens_seen": 1165239296 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003267402206619859, + "loss": 3.0019, + "theoretical_loss": 3.597186191527288, + "tokens_seen": 1165304832 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032673019057171516, + "loss": 2.8192, + "theoretical_loss": 3.5971674802078515, + "tokens_seen": 1165370368 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032672016048144434, + "loss": 3.0442, + "theoretical_loss": 3.597148770235251, + "tokens_seen": 1165435904 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003267101303911735, + "loss": 2.9849, + "theoretical_loss": 3.5971300616093136, + "tokens_seen": 1165501440 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003267001003009027, + "loss": 2.8258, + "theoretical_loss": 3.5971113543298667, + "tokens_seen": 1165566976 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003266900702106319, + "loss": 2.7861, + "theoretical_loss": 3.5970926483967385, + "tokens_seen": 1165632512 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032668004012036106, + "loss": 2.8133, + "theoretical_loss": 3.597073943809755, + "tokens_seen": 1165698048 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003266700100300903, + "loss": 2.6909, + "theoretical_loss": 3.597055240568745, + "tokens_seen": 1165763584 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003266599799398194, + "loss": 2.8826, + "theoretical_loss": 3.5970365386735352, + "tokens_seen": 1165829120 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032664994984954866, + "loss": 2.8405, + "theoretical_loss": 3.5970178381239535, + "tokens_seen": 1165894656 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032663991975927784, + "loss": 2.8524, + "theoretical_loss": 3.596999138919827, + "tokens_seen": 1165960192 + }, + { + "epoch": 3.04, + "learning_rate": 0.000326629889669007, + "loss": 2.9122, + "theoretical_loss": 3.596980441060984, + "tokens_seen": 1166025728 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003266198595787362, + "loss": 2.7682, + "theoretical_loss": 3.5969617445472517, + "tokens_seen": 1166091264 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003266098294884654, + "loss": 2.9731, + "theoretical_loss": 3.5969430493784573, + "tokens_seen": 1166156800 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032659979939819457, + "loss": 2.9765, + "theoretical_loss": 3.596924355554429, + "tokens_seen": 1166222336 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003265897693079238, + "loss": 2.7428, + "theoretical_loss": 3.5969056630749945, + "tokens_seen": 1166287872 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032657973921765293, + "loss": 2.8521, + "theoretical_loss": 3.5968869719399814, + "tokens_seen": 1166353408 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032656970912738216, + "loss": 2.9777, + "theoretical_loss": 3.5968682821492175, + "tokens_seen": 1166418944 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032655967903711135, + "loss": 2.7993, + "theoretical_loss": 3.596849593702531, + "tokens_seen": 1166484480 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1868615, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8111910820007324, + "objective/train/theoretical_loss": 3.596844921800851, + "objective/train/tokens_used": 1186960864, + "theoretical_loss": 3.596844921800851, + "tokens_seen": 1166500864 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003265496489468405, + "loss": 2.9648, + "theoretical_loss": 3.596830906599749, + "tokens_seen": 1166550016 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032653961885656976, + "loss": 2.8653, + "theoretical_loss": 3.5968122208407, + "tokens_seen": 1166615552 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003265295887662989, + "loss": 2.8723, + "theoretical_loss": 3.5967935364252113, + "tokens_seen": 1166681088 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003265195586760281, + "loss": 2.9545, + "theoretical_loss": 3.596774853353111, + "tokens_seen": 1166746624 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032650952858575725, + "loss": 2.8017, + "theoretical_loss": 3.596756171624228, + "tokens_seen": 1166812160 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003264994984954865, + "loss": 2.7064, + "theoretical_loss": 3.5967374912383887, + "tokens_seen": 1166877696 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032648946840521567, + "loss": 2.8753, + "theoretical_loss": 3.596718812195422, + "tokens_seen": 1166943232 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032647943831494485, + "loss": 2.8167, + "theoretical_loss": 3.5967001344951566, + "tokens_seen": 1167008768 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032646940822467403, + "loss": 2.7345, + "theoretical_loss": 3.5966814581374194, + "tokens_seen": 1167074304 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032645937813440326, + "loss": 2.9034, + "theoretical_loss": 3.5966627831220395, + "tokens_seen": 1167139840 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003264493480441324, + "loss": 3.0372, + "theoretical_loss": 3.5966441094488437, + "tokens_seen": 1167205376 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003264393179538616, + "loss": 2.8216, + "theoretical_loss": 3.5966254371176616, + "tokens_seen": 1167270912 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032642928786359075, + "loss": 2.9452, + "theoretical_loss": 3.5966067661283208, + "tokens_seen": 1167336448 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032641925777332, + "loss": 2.9853, + "theoretical_loss": 3.5965880964806494, + "tokens_seen": 1167401984 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032640922768304917, + "loss": 3.0064, + "theoretical_loss": 3.596569428174476, + "tokens_seen": 1167467520 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032639919759277835, + "loss": 2.7093, + "theoretical_loss": 3.5965507612096292, + "tokens_seen": 1167533056 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032638916750250753, + "loss": 2.8884, + "theoretical_loss": 3.5965320955859363, + "tokens_seen": 1167598592 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003263791374122367, + "loss": 2.8693, + "theoretical_loss": 3.5965134313032268, + "tokens_seen": 1167664128 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003263691073219659, + "loss": 2.7697, + "theoretical_loss": 3.596494768361328, + "tokens_seen": 1167729664 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032635907723169513, + "loss": 2.8985, + "theoretical_loss": 3.5964761067600692, + "tokens_seen": 1167795200 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032634904714142426, + "loss": 2.8131, + "theoretical_loss": 3.5964574464992793, + "tokens_seen": 1167860736 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003263390170511535, + "loss": 3.0135, + "theoretical_loss": 3.5964387875787853, + "tokens_seen": 1167926272 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003263289869608826, + "loss": 2.8951, + "theoretical_loss": 3.5964201299984166, + "tokens_seen": 1167991808 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032631895687061185, + "loss": 3.0105, + "theoretical_loss": 3.5964014737580023, + "tokens_seen": 1168057344 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032630892678034104, + "loss": 2.9075, + "theoretical_loss": 3.5963828188573697, + "tokens_seen": 1168122880 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1871360, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.10221004486084, + "objective/train/theoretical_loss": 3.596378155341533, + "objective/train/tokens_used": 1188599264, + "theoretical_loss": 3.596378155341533, + "tokens_seen": 1168139264 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003262988966900702, + "loss": 2.8773, + "theoretical_loss": 3.5963641652963485, + "tokens_seen": 1168188416 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003262888665997994, + "loss": 2.815, + "theoretical_loss": 3.596345513074767, + "tokens_seen": 1168253952 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032627883650952863, + "loss": 3.0715, + "theoretical_loss": 3.596326862192454, + "tokens_seen": 1168319488 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032626880641925776, + "loss": 3.1381, + "theoretical_loss": 3.5963082126492383, + "tokens_seen": 1168385024 + }, + { + "epoch": 3.04, + "learning_rate": 0.000326258776328987, + "loss": 2.9446, + "theoretical_loss": 3.596289564444948, + "tokens_seen": 1168450560 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003262487462387161, + "loss": 2.7013, + "theoretical_loss": 3.596270917579412, + "tokens_seen": 1168516096 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032623871614844536, + "loss": 2.7835, + "theoretical_loss": 3.5962522720524603, + "tokens_seen": 1168581632 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032622868605817454, + "loss": 2.7678, + "theoretical_loss": 3.5962336278639206, + "tokens_seen": 1168647168 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003262186559679037, + "loss": 2.935, + "theoretical_loss": 3.5962149850136216, + "tokens_seen": 1168712704 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003262086258776329, + "loss": 2.8234, + "theoretical_loss": 3.5961963435013935, + "tokens_seen": 1168778240 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003261985957873621, + "loss": 2.8474, + "theoretical_loss": 3.596177703327064, + "tokens_seen": 1168843776 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032618856569709126, + "loss": 2.8713, + "theoretical_loss": 3.5961590644904624, + "tokens_seen": 1168909312 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003261785356068205, + "loss": 2.7543, + "theoretical_loss": 3.596140426991418, + "tokens_seen": 1168974848 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003261685055165496, + "loss": 2.91, + "theoretical_loss": 3.596121790829759, + "tokens_seen": 1169040384 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032615847542627886, + "loss": 2.7786, + "theoretical_loss": 3.596103156005316, + "tokens_seen": 1169105920 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032614844533600804, + "loss": 2.802, + "theoretical_loss": 3.5960845225179168, + "tokens_seen": 1169171456 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003261384152457372, + "loss": 2.828, + "theoretical_loss": 3.5960658903673908, + "tokens_seen": 1169236992 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003261283851554664, + "loss": 2.9427, + "theoretical_loss": 3.5960472595535675, + "tokens_seen": 1169302528 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003261183550651956, + "loss": 2.7747, + "theoretical_loss": 3.596028630076276, + "tokens_seen": 1169368064 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032610832497492477, + "loss": 2.9529, + "theoretical_loss": 3.5960100019353454, + "tokens_seen": 1169433600 + }, + { + "epoch": 3.04, + "learning_rate": 0.000326098294884654, + "loss": 2.7234, + "theoretical_loss": 3.595991375130605, + "tokens_seen": 1169499136 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032608826479438313, + "loss": 2.8535, + "theoretical_loss": 3.595972749661884, + "tokens_seen": 1169564672 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032607823470411236, + "loss": 2.9228, + "theoretical_loss": 3.5959541255290115, + "tokens_seen": 1169630208 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003260682046138415, + "loss": 2.7207, + "theoretical_loss": 3.595935502731817, + "tokens_seen": 1169695744 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003260581745235707, + "loss": 2.9597, + "theoretical_loss": 3.5959168812701305, + "tokens_seen": 1169761280 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1874343, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9497649669647217, + "objective/train/theoretical_loss": 3.595912226113362, + "objective/train/tokens_used": 1190237664, + "theoretical_loss": 3.595912226113362, + "tokens_seen": 1169777664 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003260481444332999, + "loss": 2.9427, + "theoretical_loss": 3.595898261143781, + "tokens_seen": 1169826816 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003260381143430291, + "loss": 2.9461, + "theoretical_loss": 3.5958796423525974, + "tokens_seen": 1169892352 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032602808425275827, + "loss": 2.7567, + "theoretical_loss": 3.5958610248964096, + "tokens_seen": 1169957888 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032601805416248745, + "loss": 2.8096, + "theoretical_loss": 3.595842408775048, + "tokens_seen": 1170023424 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032600802407221663, + "loss": 2.9359, + "theoretical_loss": 3.59582379398834, + "tokens_seen": 1170088960 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032599799398194587, + "loss": 2.8098, + "theoretical_loss": 3.595805180536117, + "tokens_seen": 1170154496 + }, + { + "epoch": 3.04, + "learning_rate": 0.000325987963891675, + "loss": 3.0104, + "theoretical_loss": 3.595786568418208, + "tokens_seen": 1170220032 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032597793380140423, + "loss": 2.9517, + "theoretical_loss": 3.5957679576344432, + "tokens_seen": 1170285568 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003259679037111334, + "loss": 2.729, + "theoretical_loss": 3.5957493481846514, + "tokens_seen": 1170351104 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003259578736208626, + "loss": 2.8127, + "theoretical_loss": 3.5957307400686624, + "tokens_seen": 1170416640 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003259478435305918, + "loss": 2.6753, + "theoretical_loss": 3.5957121332863062, + "tokens_seen": 1170482176 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032593781344032095, + "loss": 2.8566, + "theoretical_loss": 3.5956935278374127, + "tokens_seen": 1170547712 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032592778335005014, + "loss": 2.7615, + "theoretical_loss": 3.5956749237218113, + "tokens_seen": 1170613248 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032591775325977937, + "loss": 2.9363, + "theoretical_loss": 3.595656320939332, + "tokens_seen": 1170678784 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003259077231695085, + "loss": 2.97, + "theoretical_loss": 3.595637719489805, + "tokens_seen": 1170744320 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032589769307923773, + "loss": 2.9332, + "theoretical_loss": 3.5956191193730596, + "tokens_seen": 1170809856 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032588766298896686, + "loss": 2.7491, + "theoretical_loss": 3.595600520588926, + "tokens_seen": 1170875392 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003258776328986961, + "loss": 2.7693, + "theoretical_loss": 3.5955819231372343, + "tokens_seen": 1170940928 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003258676028084253, + "loss": 2.8359, + "theoretical_loss": 3.595563327017814, + "tokens_seen": 1171006464 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032585757271815446, + "loss": 2.8282, + "theoretical_loss": 3.5955447322304956, + "tokens_seen": 1171072000 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032584754262788364, + "loss": 2.8105, + "theoretical_loss": 3.595526138775109, + "tokens_seen": 1171137536 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003258375125376128, + "loss": 2.9289, + "theoretical_loss": 3.5955075466514836, + "tokens_seen": 1171203072 + }, + { + "epoch": 3.04, + "learning_rate": 0.000325827482447342, + "loss": 2.9281, + "theoretical_loss": 3.5954889558594507, + "tokens_seen": 1171268608 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032581745235707124, + "loss": 2.9256, + "theoretical_loss": 3.59547036639884, + "tokens_seen": 1171334144 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003258074222668004, + "loss": 2.9495, + "theoretical_loss": 3.5954517782694806, + "tokens_seen": 1171399680 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1876977, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.082993507385254, + "objective/train/theoretical_loss": 3.5954471314451295, + "objective/train/tokens_used": 1191876064, + "theoretical_loss": 3.5954471314451295, + "tokens_seen": 1171416064 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003257973921765296, + "loss": 2.8643, + "theoretical_loss": 3.595433191471204, + "tokens_seen": 1171465216 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032578736208625883, + "loss": 3.0065, + "theoretical_loss": 3.5954146060038408, + "tokens_seen": 1171530752 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032577733199598796, + "loss": 2.8201, + "theoretical_loss": 3.5953960218672196, + "tokens_seen": 1171596288 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003257673019057172, + "loss": 2.7726, + "theoretical_loss": 3.5953774390611715, + "tokens_seen": 1171661824 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003257572718154463, + "loss": 2.7212, + "theoretical_loss": 3.595358857585527, + "tokens_seen": 1171727360 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032574724172517556, + "loss": 2.8901, + "theoretical_loss": 3.595340277440117, + "tokens_seen": 1171792896 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032573721163490474, + "loss": 2.9346, + "theoretical_loss": 3.5953216986247707, + "tokens_seen": 1171858432 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003257271815446339, + "loss": 2.6345, + "theoretical_loss": 3.5953031211393185, + "tokens_seen": 1171923968 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003257171514543631, + "loss": 3.0151, + "theoretical_loss": 3.595284544983592, + "tokens_seen": 1171989504 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003257071213640923, + "loss": 2.8736, + "theoretical_loss": 3.5952659701574206, + "tokens_seen": 1172055040 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032569709127382146, + "loss": 2.7817, + "theoretical_loss": 3.5952473966606355, + "tokens_seen": 1172120576 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003256870611835507, + "loss": 2.9478, + "theoretical_loss": 3.5952288244930672, + "tokens_seen": 1172186112 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003256770310932798, + "loss": 2.9292, + "theoretical_loss": 3.5952102536545456, + "tokens_seen": 1172251648 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032566700100300906, + "loss": 2.8028, + "theoretical_loss": 3.5951916841449023, + "tokens_seen": 1172317184 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032565697091273824, + "loss": 2.776, + "theoretical_loss": 3.5951731159639664, + "tokens_seen": 1172382720 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003256469408224674, + "loss": 2.8607, + "theoretical_loss": 3.5951545491115704, + "tokens_seen": 1172448256 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003256369107321966, + "loss": 2.9102, + "theoretical_loss": 3.595135983587544, + "tokens_seen": 1172513792 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003256268806419258, + "loss": 2.8681, + "theoretical_loss": 3.5951174193917175, + "tokens_seen": 1172579328 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032561685055165497, + "loss": 2.781, + "theoretical_loss": 3.595098856523922, + "tokens_seen": 1172644864 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003256068204613842, + "loss": 2.743, + "theoretical_loss": 3.5950802949839895, + "tokens_seen": 1172710400 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032559679037111333, + "loss": 2.7979, + "theoretical_loss": 3.595061734771749, + "tokens_seen": 1172775936 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032558676028084256, + "loss": 2.7053, + "theoretical_loss": 3.5950431758870325, + "tokens_seen": 1172841472 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003255767301905717, + "loss": 2.7421, + "theoretical_loss": 3.5950246183296706, + "tokens_seen": 1172907008 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003255667001003009, + "loss": 2.8931, + "theoretical_loss": 3.5950060620994937, + "tokens_seen": 1172972544 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003255566700100301, + "loss": 2.8538, + "theoretical_loss": 3.5949875071963335, + "tokens_seen": 1173038080 + }, + { + "epoch": 3.04, + "objective/train/docs_used": 1879951, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.022859811782837, + "objective/train/theoretical_loss": 3.5949828686778695, + "objective/train/tokens_used": 1193514464, + "theoretical_loss": 3.5949828686778695, + "tokens_seen": 1173054464 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003255466399197593, + "loss": 2.9868, + "theoretical_loss": 3.59496895362002, + "tokens_seen": 1173103616 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032553660982948847, + "loss": 2.6871, + "theoretical_loss": 3.5949504013703857, + "tokens_seen": 1173169152 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032552657973921765, + "loss": 2.9638, + "theoretical_loss": 3.59493185044726, + "tokens_seen": 1173234688 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032551654964894683, + "loss": 2.9328, + "theoretical_loss": 3.5949133008504752, + "tokens_seen": 1173300224 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032550651955867607, + "loss": 2.7514, + "theoretical_loss": 3.594894752579862, + "tokens_seen": 1173365760 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003254964894684052, + "loss": 2.8549, + "theoretical_loss": 3.5948762056352512, + "tokens_seen": 1173431296 + }, + { + "epoch": 3.04, + "learning_rate": 0.00032548645937813443, + "loss": 2.9192, + "theoretical_loss": 3.5948576600164746, + "tokens_seen": 1173496832 + }, + { + "epoch": 3.04, + "learning_rate": 0.0003254764292878636, + "loss": 2.8148, + "theoretical_loss": 3.594839115723362, + "tokens_seen": 1173562368 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003254663991975928, + "loss": 2.8761, + "theoretical_loss": 3.5948205727557463, + "tokens_seen": 1173627904 + }, + { + "epoch": 3.05, + "learning_rate": 0.000325456369107322, + "loss": 2.8926, + "theoretical_loss": 3.594802031113458, + "tokens_seen": 1173693440 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032544633901705115, + "loss": 2.8534, + "theoretical_loss": 3.5947834907963285, + "tokens_seen": 1173758976 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032543630892678034, + "loss": 2.8661, + "theoretical_loss": 3.594764951804189, + "tokens_seen": 1173824512 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032542627883650957, + "loss": 2.7795, + "theoretical_loss": 3.594746414136871, + "tokens_seen": 1173890048 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003254162487462387, + "loss": 2.7647, + "theoretical_loss": 3.594727877794205, + "tokens_seen": 1173955584 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032540621865596793, + "loss": 2.9445, + "theoretical_loss": 3.5947093427760244, + "tokens_seen": 1174021120 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032539618856569706, + "loss": 2.7734, + "theoretical_loss": 3.5946908090821585, + "tokens_seen": 1174086656 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003253861584754263, + "loss": 2.9973, + "theoretical_loss": 3.5946722767124397, + "tokens_seen": 1174152192 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003253761283851555, + "loss": 2.8842, + "theoretical_loss": 3.5946537456667, + "tokens_seen": 1174217728 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032536609829488466, + "loss": 2.876, + "theoretical_loss": 3.59463521594477, + "tokens_seen": 1174283264 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032535606820461384, + "loss": 2.9295, + "theoretical_loss": 3.5946166875464813, + "tokens_seen": 1174348800 + }, + { + "epoch": 3.05, + "learning_rate": 0.000325346038114343, + "loss": 2.9405, + "theoretical_loss": 3.5945981604716666, + "tokens_seen": 1174414336 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003253360080240722, + "loss": 2.8223, + "theoretical_loss": 3.594579634720156, + "tokens_seen": 1174479872 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032532597793380144, + "loss": 2.9718, + "theoretical_loss": 3.5945611102917825, + "tokens_seen": 1174545408 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032531594784353056, + "loss": 2.8262, + "theoretical_loss": 3.5945425871863765, + "tokens_seen": 1174610944 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003253059177532598, + "loss": 2.8883, + "theoretical_loss": 3.594524065403771, + "tokens_seen": 1174676480 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1881236, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.064966917037964, + "objective/train/theoretical_loss": 3.594519435164787, + "objective/train/tokens_used": 1195152864, + "theoretical_loss": 3.594519435164787, + "tokens_seen": 1174692864 + }, + { + "epoch": 3.05, + "learning_rate": 0.000325295887662989, + "loss": 2.8296, + "theoretical_loss": 3.5945055449437966, + "tokens_seen": 1174742016 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032528585757271816, + "loss": 2.8195, + "theoretical_loss": 3.594487025806286, + "tokens_seen": 1174807552 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032527582748244734, + "loss": 2.8449, + "theoretical_loss": 3.5944685079910705, + "tokens_seen": 1174873088 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003252657973921765, + "loss": 2.9725, + "theoretical_loss": 3.5944499914979824, + "tokens_seen": 1174938624 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003252557673019057, + "loss": 2.5492, + "theoretical_loss": 3.5944314763268523, + "tokens_seen": 1175004160 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032524573721163494, + "loss": 2.8425, + "theoretical_loss": 3.594412962477514, + "tokens_seen": 1175069696 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032523570712136407, + "loss": 2.8905, + "theoretical_loss": 3.5943944499497977, + "tokens_seen": 1175135232 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003252256770310933, + "loss": 2.9621, + "theoretical_loss": 3.5943759387435366, + "tokens_seen": 1175200768 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032521564694082243, + "loss": 2.8305, + "theoretical_loss": 3.5943574288585616, + "tokens_seen": 1175266304 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032520561685055166, + "loss": 2.9061, + "theoretical_loss": 3.594338920294706, + "tokens_seen": 1175331840 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032519558676028084, + "loss": 2.8299, + "theoretical_loss": 3.5943204130518005, + "tokens_seen": 1175397376 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032518555667001, + "loss": 2.8313, + "theoretical_loss": 3.594301907129678, + "tokens_seen": 1175462912 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003251755265797392, + "loss": 2.7007, + "theoretical_loss": 3.5942834025281707, + "tokens_seen": 1175528448 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032516549648946844, + "loss": 2.9868, + "theoretical_loss": 3.59426489924711, + "tokens_seen": 1175593984 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032515546639919757, + "loss": 2.8123, + "theoretical_loss": 3.5942463972863288, + "tokens_seen": 1175659520 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003251454363089268, + "loss": 2.7651, + "theoretical_loss": 3.594227896645659, + "tokens_seen": 1175725056 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032513540621865593, + "loss": 2.7247, + "theoretical_loss": 3.594209397324933, + "tokens_seen": 1175790592 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032512537612838517, + "loss": 2.892, + "theoretical_loss": 3.5941908993239826, + "tokens_seen": 1175856128 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032511534603811435, + "loss": 3.0715, + "theoretical_loss": 3.5941724026426414, + "tokens_seen": 1175921664 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032510531594784353, + "loss": 2.7794, + "theoretical_loss": 3.59415390728074, + "tokens_seen": 1175987200 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003250952858575727, + "loss": 2.9815, + "theoretical_loss": 3.5941354132381114, + "tokens_seen": 1176052736 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003250852557673019, + "loss": 2.9273, + "theoretical_loss": 3.594116920514588, + "tokens_seen": 1176118272 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032507522567703107, + "loss": 2.8135, + "theoretical_loss": 3.594098429110003, + "tokens_seen": 1176183808 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003250651955867603, + "loss": 2.9235, + "theoretical_loss": 3.594079939024188, + "tokens_seen": 1176249344 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003250551654964895, + "loss": 2.8662, + "theoretical_loss": 3.5940614502569757, + "tokens_seen": 1176314880 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1884052, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.79643177986145, + "objective/train/theoretical_loss": 3.5940568282711842, + "objective/train/tokens_used": 1196791264, + "theoretical_loss": 3.5940568282711842, + "tokens_seen": 1176331264 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032504513540621867, + "loss": 2.569, + "theoretical_loss": 3.5940429628081985, + "tokens_seen": 1176380416 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032503510531594785, + "loss": 2.9523, + "theoretical_loss": 3.5940244766776885, + "tokens_seen": 1176445952 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032502507522567703, + "loss": 2.967, + "theoretical_loss": 3.594005991865279, + "tokens_seen": 1176511488 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032501504513540627, + "loss": 2.7426, + "theoretical_loss": 3.5939875083708026, + "tokens_seen": 1176577024 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003250050150451354, + "loss": 2.8887, + "theoretical_loss": 3.5939690261940918, + "tokens_seen": 1176642560 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032499498495486463, + "loss": 2.8478, + "theoretical_loss": 3.593950545334979, + "tokens_seen": 1176708096 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003249849548645938, + "loss": 2.7521, + "theoretical_loss": 3.593932065793297, + "tokens_seen": 1176773632 + }, + { + "epoch": 3.05, + "learning_rate": 0.000324974924774323, + "loss": 2.6865, + "theoretical_loss": 3.5939135875688786, + "tokens_seen": 1176839168 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003249648946840522, + "loss": 2.9441, + "theoretical_loss": 3.593895110661557, + "tokens_seen": 1176904704 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032495486459378135, + "loss": 2.6928, + "theoretical_loss": 3.593876635071164, + "tokens_seen": 1176970240 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032494483450351054, + "loss": 2.8559, + "theoretical_loss": 3.5938581607975335, + "tokens_seen": 1177035776 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032493480441323977, + "loss": 2.7407, + "theoretical_loss": 3.5938396878404975, + "tokens_seen": 1177101312 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003249247743229689, + "loss": 2.8174, + "theoretical_loss": 3.593821216199889, + "tokens_seen": 1177166848 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032491474423269813, + "loss": 2.9334, + "theoretical_loss": 3.593802745875541, + "tokens_seen": 1177232384 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032490471414242726, + "loss": 2.9069, + "theoretical_loss": 3.5937842768672867, + "tokens_seen": 1177297920 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003248946840521565, + "loss": 2.8951, + "theoretical_loss": 3.593765809174959, + "tokens_seen": 1177363456 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003248846539618857, + "loss": 2.6986, + "theoretical_loss": 3.59374734279839, + "tokens_seen": 1177428992 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032487462387161486, + "loss": 2.9038, + "theoretical_loss": 3.593728877737415, + "tokens_seen": 1177494528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032486459378134404, + "loss": 2.9247, + "theoretical_loss": 3.5937104139918645, + "tokens_seen": 1177560064 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003248545636910732, + "loss": 3.118, + "theoretical_loss": 3.5936919515615724, + "tokens_seen": 1177625600 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003248445336008024, + "loss": 2.7168, + "theoretical_loss": 3.5936734904463727, + "tokens_seen": 1177691136 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032483450351053164, + "loss": 2.9399, + "theoretical_loss": 3.593655030646098, + "tokens_seen": 1177756672 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032482447342026076, + "loss": 2.8497, + "theoretical_loss": 3.593636572160581, + "tokens_seen": 1177822208 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032481444332999, + "loss": 2.876, + "theoretical_loss": 3.5936181149896553, + "tokens_seen": 1177887744 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003248044132397192, + "loss": 2.8806, + "theoretical_loss": 3.593599659133155, + "tokens_seen": 1177953280 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1886939, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6525890827178955, + "objective/train/theoretical_loss": 3.593595045374389, + "objective/train/tokens_used": 1198429664, + "theoretical_loss": 3.593595045374389, + "tokens_seen": 1177969664 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032479438314944836, + "loss": 2.8704, + "theoretical_loss": 3.5935812045909117, + "tokens_seen": 1178018816 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032478435305917754, + "loss": 2.7667, + "theoretical_loss": 3.59356275136276, + "tokens_seen": 1178084352 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003247743229689067, + "loss": 2.8899, + "theoretical_loss": 3.5935442994485323, + "tokens_seen": 1178149888 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003247642928786359, + "loss": 2.7411, + "theoretical_loss": 3.5935258488480626, + "tokens_seen": 1178215424 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032475426278836514, + "loss": 2.8702, + "theoretical_loss": 3.5935073995611844, + "tokens_seen": 1178280960 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032474423269809427, + "loss": 2.7112, + "theoretical_loss": 3.5934889515877306, + "tokens_seen": 1178346496 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003247342026078235, + "loss": 2.9218, + "theoretical_loss": 3.5934705049275353, + "tokens_seen": 1178412032 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032472417251755263, + "loss": 2.8863, + "theoretical_loss": 3.5934520595804313, + "tokens_seen": 1178477568 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032471414242728186, + "loss": 2.9396, + "theoretical_loss": 3.5934336155462523, + "tokens_seen": 1178543104 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032470411233701104, + "loss": 2.9666, + "theoretical_loss": 3.593415172824832, + "tokens_seen": 1178608640 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003246940822467402, + "loss": 2.8323, + "theoretical_loss": 3.593396731416004, + "tokens_seen": 1178674176 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003246840521564694, + "loss": 2.8521, + "theoretical_loss": 3.5933782913196017, + "tokens_seen": 1178739712 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032467402206619864, + "loss": 2.9803, + "theoretical_loss": 3.593359852535459, + "tokens_seen": 1178805248 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032466399197592777, + "loss": 2.778, + "theoretical_loss": 3.5933414150634095, + "tokens_seen": 1178870784 + }, + { + "epoch": 3.05, + "learning_rate": 0.000324653961885657, + "loss": 2.8606, + "theoretical_loss": 3.593322978903287, + "tokens_seen": 1178936320 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032464393179538613, + "loss": 2.8883, + "theoretical_loss": 3.5933045440549245, + "tokens_seen": 1179001856 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032463390170511537, + "loss": 2.7788, + "theoretical_loss": 3.5932861105181573, + "tokens_seen": 1179067392 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032462387161484455, + "loss": 2.9095, + "theoretical_loss": 3.5932676782928175, + "tokens_seen": 1179132928 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032461384152457373, + "loss": 2.9117, + "theoretical_loss": 3.59324924737874, + "tokens_seen": 1179198464 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003246038114343029, + "loss": 2.8462, + "theoretical_loss": 3.593230817775758, + "tokens_seen": 1179264000 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003245937813440321, + "loss": 2.8931, + "theoretical_loss": 3.5932123894837056, + "tokens_seen": 1179329536 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032458375125376127, + "loss": 2.8727, + "theoretical_loss": 3.5931939625024167, + "tokens_seen": 1179395072 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003245737211634905, + "loss": 2.8112, + "theoretical_loss": 3.5931755368317257, + "tokens_seen": 1179460608 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032456369107321963, + "loss": 2.7193, + "theoretical_loss": 3.593157112471466, + "tokens_seen": 1179526144 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032455366098294887, + "loss": 2.8812, + "theoretical_loss": 3.593138689421472, + "tokens_seen": 1179591680 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1889250, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.812249183654785, + "objective/train/theoretical_loss": 3.593134083863683, + "objective/train/tokens_used": 1200068064, + "theoretical_loss": 3.593134083863683, + "tokens_seen": 1179608064 + }, + { + "epoch": 3.05, + "learning_rate": 0.000324543630892678, + "loss": 2.8721, + "theoretical_loss": 3.593120267681577, + "tokens_seen": 1179657216 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032453360080240723, + "loss": 2.9909, + "theoretical_loss": 3.5931018472516163, + "tokens_seen": 1179722752 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003245235707121364, + "loss": 2.8534, + "theoretical_loss": 3.593083428131423, + "tokens_seen": 1179788288 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003245135406218656, + "loss": 2.8508, + "theoretical_loss": 3.5930650103208315, + "tokens_seen": 1179853824 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003245035105315948, + "loss": 2.9261, + "theoretical_loss": 3.5930465938196754, + "tokens_seen": 1179919360 + }, + { + "epoch": 3.05, + "learning_rate": 0.000324493480441324, + "loss": 2.8539, + "theoretical_loss": 3.5930281786277902, + "tokens_seen": 1179984896 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032448345035105314, + "loss": 2.998, + "theoretical_loss": 3.593009764745009, + "tokens_seen": 1180050432 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003244734202607824, + "loss": 2.8859, + "theoretical_loss": 3.5929913521711665, + "tokens_seen": 1180115968 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003244633901705115, + "loss": 2.7868, + "theoretical_loss": 3.5929729409060966, + "tokens_seen": 1180181504 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032445336008024074, + "loss": 2.7717, + "theoretical_loss": 3.5929545309496342, + "tokens_seen": 1180247040 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003244433299899699, + "loss": 2.8898, + "theoretical_loss": 3.592936122301613, + "tokens_seen": 1180312576 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003244332998996991, + "loss": 2.8954, + "theoretical_loss": 3.592917714961868, + "tokens_seen": 1180378112 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003244232698094283, + "loss": 2.9062, + "theoretical_loss": 3.592899308930233, + "tokens_seen": 1180443648 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032441323971915746, + "loss": 2.9638, + "theoretical_loss": 3.5928809042065426, + "tokens_seen": 1180509184 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032440320962888664, + "loss": 2.755, + "theoretical_loss": 3.5928625007906314, + "tokens_seen": 1180574720 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003243931795386159, + "loss": 2.8966, + "theoretical_loss": 3.592844098682334, + "tokens_seen": 1180640256 + }, + { + "epoch": 3.05, + "learning_rate": 0.000324383149448345, + "loss": 2.9552, + "theoretical_loss": 3.5928256978814845, + "tokens_seen": 1180705792 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032437311935807424, + "loss": 2.8289, + "theoretical_loss": 3.5928072983879176, + "tokens_seen": 1180771328 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032436308926780337, + "loss": 2.7356, + "theoretical_loss": 3.5927889002014686, + "tokens_seen": 1180836864 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003243530591775326, + "loss": 2.8424, + "theoretical_loss": 3.5927705033219706, + "tokens_seen": 1180902400 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003243430290872618, + "loss": 2.9685, + "theoretical_loss": 3.5927521077492597, + "tokens_seen": 1180967936 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032433299899699096, + "loss": 2.8856, + "theoretical_loss": 3.5927337134831694, + "tokens_seen": 1181033472 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003243229689067202, + "loss": 2.946, + "theoretical_loss": 3.5927153205235354, + "tokens_seen": 1181099008 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003243129388164494, + "loss": 2.8839, + "theoretical_loss": 3.5926969288701915, + "tokens_seen": 1181164544 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032430290872617856, + "loss": 2.7087, + "theoretical_loss": 3.5926785385229736, + "tokens_seen": 1181230080 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1892226, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8384485244750977, + "objective/train/theoretical_loss": 3.592673941140232, + "objective/train/tokens_used": 1201706464, + "theoretical_loss": 3.592673941140232, + "tokens_seen": 1181246464 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032429287863590774, + "loss": 2.8557, + "theoretical_loss": 3.5926601494817154, + "tokens_seen": 1181295616 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003242828485456369, + "loss": 2.7033, + "theoretical_loss": 3.5926417617462523, + "tokens_seen": 1181361152 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003242728184553661, + "loss": 2.9241, + "theoretical_loss": 3.5926233753164185, + "tokens_seen": 1181426688 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032426278836509534, + "loss": 2.9088, + "theoretical_loss": 3.59260499019205, + "tokens_seen": 1181492224 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032425275827482447, + "loss": 2.9244, + "theoretical_loss": 3.5925866063729806, + "tokens_seen": 1181557760 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003242427281845537, + "loss": 2.849, + "theoretical_loss": 3.592568223859046, + "tokens_seen": 1181623296 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032423269809428283, + "loss": 2.8959, + "theoretical_loss": 3.5925498426500804, + "tokens_seen": 1181688832 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032422266800401206, + "loss": 2.9283, + "theoretical_loss": 3.5925314627459195, + "tokens_seen": 1181754368 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032421263791374125, + "loss": 2.7202, + "theoretical_loss": 3.592513084146398, + "tokens_seen": 1181819904 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003242026078234704, + "loss": 2.9288, + "theoretical_loss": 3.592494706851351, + "tokens_seen": 1181885440 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003241925777331996, + "loss": 2.8248, + "theoretical_loss": 3.592476330860614, + "tokens_seen": 1181950976 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032418254764292884, + "loss": 2.8457, + "theoretical_loss": 3.5924579561740213, + "tokens_seen": 1182016512 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032417251755265797, + "loss": 2.8718, + "theoretical_loss": 3.5924395827914086, + "tokens_seen": 1182082048 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003241624874623872, + "loss": 2.8944, + "theoretical_loss": 3.592421210712611, + "tokens_seen": 1182147584 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032415245737211633, + "loss": 2.7142, + "theoretical_loss": 3.5924028399374635, + "tokens_seen": 1182213120 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032414242728184557, + "loss": 2.9214, + "theoretical_loss": 3.5923844704658014, + "tokens_seen": 1182278656 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032413239719157475, + "loss": 2.8926, + "theoretical_loss": 3.5923661022974605, + "tokens_seen": 1182344192 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032412236710130393, + "loss": 2.8802, + "theoretical_loss": 3.5923477354322753, + "tokens_seen": 1182409728 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003241123370110331, + "loss": 2.9249, + "theoretical_loss": 3.5923293698700816, + "tokens_seen": 1182475264 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003241023069207623, + "loss": 2.8084, + "theoretical_loss": 3.5923110056107146, + "tokens_seen": 1182540800 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032409227683049147, + "loss": 2.9094, + "theoretical_loss": 3.5922926426540096, + "tokens_seen": 1182606336 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003240822467402207, + "loss": 2.8435, + "theoretical_loss": 3.592274280999802, + "tokens_seen": 1182671872 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032407221664994984, + "loss": 2.9512, + "theoretical_loss": 3.5922559206479274, + "tokens_seen": 1182737408 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032406218655967907, + "loss": 2.9492, + "theoretical_loss": 3.5922375615982216, + "tokens_seen": 1182802944 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003240521564694082, + "loss": 2.8458, + "theoretical_loss": 3.5922192038505196, + "tokens_seen": 1182868480 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1895058, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1715705394744873, + "objective/train/theoretical_loss": 3.592214614617013, + "objective/train/tokens_used": 1203344864, + "theoretical_loss": 3.592214614617013, + "tokens_seen": 1182884864 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032404212637913743, + "loss": 2.8691, + "theoretical_loss": 3.5922008474046567, + "tokens_seen": 1182934016 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003240320962888666, + "loss": 3.0965, + "theoretical_loss": 3.59218249226047, + "tokens_seen": 1182999552 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003240220661985958, + "loss": 2.8419, + "theoretical_loss": 3.5921641384177927, + "tokens_seen": 1183065088 + }, + { + "epoch": 3.05, + "learning_rate": 0.000324012036108325, + "loss": 2.7573, + "theoretical_loss": 3.592145785876462, + "tokens_seen": 1183130624 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003240020060180542, + "loss": 2.7791, + "theoretical_loss": 3.5921274346363132, + "tokens_seen": 1183196160 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032399197592778334, + "loss": 2.9046, + "theoretical_loss": 3.5921090846971824, + "tokens_seen": 1183261696 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003239819458375126, + "loss": 2.8098, + "theoretical_loss": 3.5920907360589043, + "tokens_seen": 1183327232 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003239719157472417, + "loss": 2.8031, + "theoretical_loss": 3.592072388721316, + "tokens_seen": 1183392768 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032396188565697094, + "loss": 2.8649, + "theoretical_loss": 3.5920540426842518, + "tokens_seen": 1183458304 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003239518555667001, + "loss": 2.9791, + "theoretical_loss": 3.5920356979475487, + "tokens_seen": 1183523840 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003239418254764293, + "loss": 3.0084, + "theoretical_loss": 3.592017354511042, + "tokens_seen": 1183589376 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003239317953861585, + "loss": 2.9578, + "theoretical_loss": 3.5919990123745675, + "tokens_seen": 1183654912 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032392176529588766, + "loss": 2.9141, + "theoretical_loss": 3.5919806715379616, + "tokens_seen": 1183720448 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032391173520561684, + "loss": 3.0509, + "theoretical_loss": 3.591962332001059, + "tokens_seen": 1183785984 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003239017051153461, + "loss": 2.9885, + "theoretical_loss": 3.591943993763697, + "tokens_seen": 1183851520 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003238916750250752, + "loss": 2.6508, + "theoretical_loss": 3.591925656825711, + "tokens_seen": 1183917056 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032388164493480444, + "loss": 2.9565, + "theoretical_loss": 3.5919073211869375, + "tokens_seen": 1183982592 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032387161484453357, + "loss": 2.8851, + "theoretical_loss": 3.591888986847212, + "tokens_seen": 1184048128 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003238615847542628, + "loss": 2.8376, + "theoretical_loss": 3.5918706538063705, + "tokens_seen": 1184113664 + }, + { + "epoch": 3.05, + "learning_rate": 0.000323851554663992, + "loss": 2.8123, + "theoretical_loss": 3.5918523220642493, + "tokens_seen": 1184179200 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032384152457372116, + "loss": 2.9835, + "theoretical_loss": 3.5918339916206845, + "tokens_seen": 1184244736 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032383149448345034, + "loss": 2.8634, + "theoretical_loss": 3.591815662475512, + "tokens_seen": 1184310272 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003238214643931796, + "loss": 2.9248, + "theoretical_loss": 3.591797334628569, + "tokens_seen": 1184375808 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003238114343029087, + "loss": 2.8638, + "theoretical_loss": 3.5917790080796905, + "tokens_seen": 1184441344 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032380140421263794, + "loss": 2.8446, + "theoretical_loss": 3.5917606828287134, + "tokens_seen": 1184506880 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1898140, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.047255754470825, + "objective/train/theoretical_loss": 3.5917561017187474, + "objective/train/tokens_used": 1204983264, + "theoretical_loss": 3.5917561017187474, + "tokens_seen": 1184523264 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032379137412236707, + "loss": 3.0347, + "theoretical_loss": 3.591742358875474, + "tokens_seen": 1184572416 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003237813440320963, + "loss": 2.8384, + "theoretical_loss": 3.591724036219808, + "tokens_seen": 1184637952 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003237713139418255, + "loss": 2.9119, + "theoretical_loss": 3.5917057148615528, + "tokens_seen": 1184703488 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032376128385155467, + "loss": 2.872, + "theoretical_loss": 3.5916873948005437, + "tokens_seen": 1184769024 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032375125376128385, + "loss": 2.7477, + "theoretical_loss": 3.591669076036618, + "tokens_seen": 1184834560 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032374122367101303, + "loss": 2.8796, + "theoretical_loss": 3.591650758569611, + "tokens_seen": 1184900096 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003237311935807422, + "loss": 2.8453, + "theoretical_loss": 3.5916324423993604, + "tokens_seen": 1184965632 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032372116349047145, + "loss": 3.1081, + "theoretical_loss": 3.591614127525702, + "tokens_seen": 1185031168 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032371113340020057, + "loss": 2.7328, + "theoretical_loss": 3.591595813948472, + "tokens_seen": 1185096704 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003237011033099298, + "loss": 2.8408, + "theoretical_loss": 3.591577501667508, + "tokens_seen": 1185162240 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032369107321965893, + "loss": 2.9436, + "theoretical_loss": 3.591559190682646, + "tokens_seen": 1185227776 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032368104312938817, + "loss": 2.9952, + "theoretical_loss": 3.5915408809937217, + "tokens_seen": 1185293312 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032367101303911735, + "loss": 2.8975, + "theoretical_loss": 3.5915225726005735, + "tokens_seen": 1185358848 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032366098294884653, + "loss": 2.8163, + "theoretical_loss": 3.591504265503037, + "tokens_seen": 1185424384 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003236509528585757, + "loss": 2.945, + "theoretical_loss": 3.5914859597009485, + "tokens_seen": 1185489920 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032364092276830495, + "loss": 2.8179, + "theoretical_loss": 3.591467655194146, + "tokens_seen": 1185555456 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003236308926780341, + "loss": 2.8065, + "theoretical_loss": 3.5914493519824653, + "tokens_seen": 1185620992 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003236208625877633, + "loss": 2.9599, + "theoretical_loss": 3.591431050065743, + "tokens_seen": 1185686528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032361083249749244, + "loss": 2.9657, + "theoretical_loss": 3.591412749443817, + "tokens_seen": 1185752064 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003236008024072217, + "loss": 2.9489, + "theoretical_loss": 3.591394450116523, + "tokens_seen": 1185817600 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032359077231695085, + "loss": 2.8909, + "theoretical_loss": 3.5913761520836984, + "tokens_seen": 1185883136 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032358074222668004, + "loss": 2.8201, + "theoretical_loss": 3.5913578553451804, + "tokens_seen": 1185948672 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032357071213640927, + "loss": 2.8423, + "theoretical_loss": 3.5913395599008053, + "tokens_seen": 1186014208 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003235606820461384, + "loss": 2.913, + "theoretical_loss": 3.5913212657504103, + "tokens_seen": 1186079744 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032355065195586763, + "loss": 2.8308, + "theoretical_loss": 3.591302972893833, + "tokens_seen": 1186145280 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1901076, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7757866382598877, + "objective/train/theoretical_loss": 3.591298399881828, + "objective/train/tokens_used": 1206621664, + "theoretical_loss": 3.591298399881828, + "tokens_seen": 1186161664 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003235406218655968, + "loss": 2.9251, + "theoretical_loss": 3.591284681330909, + "tokens_seen": 1186210816 + }, + { + "epoch": 3.05, + "learning_rate": 0.000323530591775326, + "loss": 2.8303, + "theoretical_loss": 3.5912663910614766, + "tokens_seen": 1186276352 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003235205616850552, + "loss": 2.8406, + "theoretical_loss": 3.5912481020853724, + "tokens_seen": 1186341888 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003235105315947844, + "loss": 2.9026, + "theoretical_loss": 3.5912298144024337, + "tokens_seen": 1186407424 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032350050150451354, + "loss": 2.8821, + "theoretical_loss": 3.5912115280124977, + "tokens_seen": 1186472960 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003234904714142428, + "loss": 2.8315, + "theoretical_loss": 3.591193242915401, + "tokens_seen": 1186538496 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003234804413239719, + "loss": 2.8596, + "theoretical_loss": 3.5911749591109814, + "tokens_seen": 1186604032 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032347041123370114, + "loss": 2.8516, + "theoretical_loss": 3.591156676599076, + "tokens_seen": 1186669568 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003234603811434303, + "loss": 2.6858, + "theoretical_loss": 3.591138395379522, + "tokens_seen": 1186735104 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003234503510531595, + "loss": 2.9361, + "theoretical_loss": 3.5911201154521564, + "tokens_seen": 1186800640 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003234403209628887, + "loss": 2.857, + "theoretical_loss": 3.591101836816817, + "tokens_seen": 1186866176 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032343029087261786, + "loss": 2.8614, + "theoretical_loss": 3.591083559473341, + "tokens_seen": 1186931712 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032342026078234704, + "loss": 3.0072, + "theoretical_loss": 3.5910652834215657, + "tokens_seen": 1186997248 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003234102306920763, + "loss": 2.9952, + "theoretical_loss": 3.591047008661328, + "tokens_seen": 1187062784 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003234002006018054, + "loss": 2.7801, + "theoretical_loss": 3.5910287351924666, + "tokens_seen": 1187128320 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032339017051153464, + "loss": 2.8549, + "theoretical_loss": 3.5910104630148174, + "tokens_seen": 1187193856 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032338014042126377, + "loss": 2.7532, + "theoretical_loss": 3.590992192128219, + "tokens_seen": 1187259392 + }, + { + "epoch": 3.05, + "learning_rate": 0.000323370110330993, + "loss": 2.9029, + "theoretical_loss": 3.590973922532509, + "tokens_seen": 1187324928 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003233600802407222, + "loss": 2.8619, + "theoretical_loss": 3.5909556542275243, + "tokens_seen": 1187390464 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032335005015045136, + "loss": 2.929, + "theoretical_loss": 3.5909373872131027, + "tokens_seen": 1187456000 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032334002006018054, + "loss": 2.9138, + "theoretical_loss": 3.5909191214890814, + "tokens_seen": 1187521536 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003233299899699098, + "loss": 2.7805, + "theoretical_loss": 3.590900857055299, + "tokens_seen": 1187587072 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003233199598796389, + "loss": 2.9934, + "theoretical_loss": 3.590882593911593, + "tokens_seen": 1187652608 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032330992978936814, + "loss": 2.8763, + "theoretical_loss": 3.5908643320578, + "tokens_seen": 1187718144 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032329989969909727, + "loss": 2.9854, + "theoretical_loss": 3.5908460714937593, + "tokens_seen": 1187783680 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1903128, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1874547004699707, + "objective/train/theoretical_loss": 3.5908415065542534, + "objective/train/tokens_used": 1208260064, + "theoretical_loss": 3.5908415065542534, + "tokens_seen": 1187800064 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003232898696088265, + "loss": 2.9833, + "theoretical_loss": 3.590827812219307, + "tokens_seen": 1187849216 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003232798395185557, + "loss": 2.8926, + "theoretical_loss": 3.590809554234282, + "tokens_seen": 1187914752 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032326980942828487, + "loss": 2.9923, + "theoretical_loss": 3.590791297538522, + "tokens_seen": 1187980288 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032325977933801405, + "loss": 2.8404, + "theoretical_loss": 3.5907730421318647, + "tokens_seen": 1188045824 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032324974924774323, + "loss": 2.8457, + "theoretical_loss": 3.5907547880141477, + "tokens_seen": 1188111360 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003232397191574724, + "loss": 2.9149, + "theoretical_loss": 3.5907365351852096, + "tokens_seen": 1188176896 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032322968906720165, + "loss": 2.7827, + "theoretical_loss": 3.5907182836448874, + "tokens_seen": 1188242432 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032321965897693077, + "loss": 2.9237, + "theoretical_loss": 3.59070003339302, + "tokens_seen": 1188307968 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032320962888666, + "loss": 2.9337, + "theoretical_loss": 3.590681784429444, + "tokens_seen": 1188373504 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032319959879638913, + "loss": 2.9097, + "theoretical_loss": 3.5906635367539996, + "tokens_seen": 1188439040 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032318956870611837, + "loss": 2.989, + "theoretical_loss": 3.5906452903665227, + "tokens_seen": 1188504576 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032317953861584755, + "loss": 2.9412, + "theoretical_loss": 3.590627045266853, + "tokens_seen": 1188570112 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032316950852557673, + "loss": 2.9308, + "theoretical_loss": 3.5906088014548274, + "tokens_seen": 1188635648 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003231594784353059, + "loss": 2.8039, + "theoretical_loss": 3.590590558930285, + "tokens_seen": 1188701184 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032314944834503515, + "loss": 2.9768, + "theoretical_loss": 3.590572317693063, + "tokens_seen": 1188766720 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003231394182547643, + "loss": 2.8778, + "theoretical_loss": 3.590554077743, + "tokens_seen": 1188832256 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003231293881644935, + "loss": 2.9935, + "theoretical_loss": 3.5905358390799345, + "tokens_seen": 1188897792 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032311935807422264, + "loss": 2.9601, + "theoretical_loss": 3.590517601703705, + "tokens_seen": 1188963328 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003231093279839519, + "loss": 2.9098, + "theoretical_loss": 3.590499365614149, + "tokens_seen": 1189028864 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032309929789368105, + "loss": 2.7965, + "theoretical_loss": 3.590481130811105, + "tokens_seen": 1189094400 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032308926780341024, + "loss": 2.9028, + "theoretical_loss": 3.5904628972944117, + "tokens_seen": 1189159936 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003230792377131394, + "loss": 2.8315, + "theoretical_loss": 3.5904446650639072, + "tokens_seen": 1189225472 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003230692076228686, + "loss": 2.8765, + "theoretical_loss": 3.59042643411943, + "tokens_seen": 1189291008 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003230591775325978, + "loss": 2.8996, + "theoretical_loss": 3.590408204460819, + "tokens_seen": 1189356544 + }, + { + "epoch": 3.05, + "learning_rate": 0.000323049147442327, + "loss": 2.7224, + "theoretical_loss": 3.590389976087912, + "tokens_seen": 1189422080 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1906045, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.084784984588623, + "objective/train/theoretical_loss": 3.590385419195557, + "objective/train/tokens_used": 1209898464, + "theoretical_loss": 3.590385419195557, + "tokens_seen": 1189438464 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032303911735205614, + "loss": 2.9355, + "theoretical_loss": 3.590371749000547, + "tokens_seen": 1189487616 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003230290872617854, + "loss": 2.9103, + "theoretical_loss": 3.590353523198564, + "tokens_seen": 1189553152 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003230190571715145, + "loss": 2.7742, + "theoretical_loss": 3.5903352986818, + "tokens_seen": 1189618688 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032300902708124374, + "loss": 2.8023, + "theoretical_loss": 3.590317075450095, + "tokens_seen": 1189684224 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003229989969909729, + "loss": 2.8428, + "theoretical_loss": 3.5902988535032865, + "tokens_seen": 1189749760 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003229889669007021, + "loss": 2.8276, + "theoretical_loss": 3.590280632841213, + "tokens_seen": 1189815296 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003229789368104313, + "loss": 2.8054, + "theoretical_loss": 3.590262413463715, + "tokens_seen": 1189880832 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003229689067201605, + "loss": 2.8578, + "theoretical_loss": 3.590244195370629, + "tokens_seen": 1189946368 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032295887662988964, + "loss": 2.8816, + "theoretical_loss": 3.5902259785617945, + "tokens_seen": 1190011904 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003229488465396189, + "loss": 2.975, + "theoretical_loss": 3.590207763037051, + "tokens_seen": 1190077440 + }, + { + "epoch": 3.05, + "learning_rate": 0.000322938816449348, + "loss": 2.8849, + "theoretical_loss": 3.5901895487962365, + "tokens_seen": 1190142976 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032292878635907724, + "loss": 2.7547, + "theoretical_loss": 3.5901713358391896, + "tokens_seen": 1190208512 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003229187562688064, + "loss": 2.8691, + "theoretical_loss": 3.5901531241657496, + "tokens_seen": 1190274048 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003229087261785356, + "loss": 2.9369, + "theoretical_loss": 3.5901349137757554, + "tokens_seen": 1190339584 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003228986960882648, + "loss": 2.9609, + "theoretical_loss": 3.590116704669046, + "tokens_seen": 1190405120 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032288866599799397, + "loss": 2.9137, + "theoretical_loss": 3.5900984968454592, + "tokens_seen": 1190470656 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032287863590772315, + "loss": 2.827, + "theoretical_loss": 3.5900802903048357, + "tokens_seen": 1190536192 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003228686058174524, + "loss": 2.8717, + "theoretical_loss": 3.5900620850470135, + "tokens_seen": 1190601728 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003228585757271815, + "loss": 2.8068, + "theoretical_loss": 3.5900438810718316, + "tokens_seen": 1190667264 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032284854563691074, + "loss": 2.9701, + "theoretical_loss": 3.590025678379129, + "tokens_seen": 1190732800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003228385155466399, + "loss": 2.9935, + "theoretical_loss": 3.5900074769687453, + "tokens_seen": 1190798336 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003228284854563691, + "loss": 2.9532, + "theoretical_loss": 3.589989276840519, + "tokens_seen": 1190863872 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032281845536609834, + "loss": 2.8673, + "theoretical_loss": 3.5899710779942895, + "tokens_seen": 1190929408 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032280842527582747, + "loss": 2.8955, + "theoretical_loss": 3.589952880429896, + "tokens_seen": 1190994944 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003227983951855567, + "loss": 2.744, + "theoretical_loss": 3.5899346841471775, + "tokens_seen": 1191060480 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1908714, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0678317546844482, + "objective/train/theoretical_loss": 3.5899301352767408, + "objective/train/tokens_used": 1211536864, + "theoretical_loss": 3.5899301352767408, + "tokens_seen": 1191076864 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003227883650952859, + "loss": 2.9837, + "theoretical_loss": 3.589916489145973, + "tokens_seen": 1191126016 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032277833500501507, + "loss": 2.9169, + "theoretical_loss": 3.589898295426123, + "tokens_seen": 1191191552 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032276830491474425, + "loss": 2.9582, + "theoretical_loss": 3.589880102987465, + "tokens_seen": 1191257088 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032275827482447343, + "loss": 2.8143, + "theoretical_loss": 3.58986191182984, + "tokens_seen": 1191322624 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003227482447342026, + "loss": 3.0239, + "theoretical_loss": 3.589843721953086, + "tokens_seen": 1191388160 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032273821464393185, + "loss": 2.7991, + "theoretical_loss": 3.5898255333570424, + "tokens_seen": 1191453696 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032272818455366097, + "loss": 2.9531, + "theoretical_loss": 3.5898073460415496, + "tokens_seen": 1191519232 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003227181544633902, + "loss": 2.8713, + "theoretical_loss": 3.5897891600064464, + "tokens_seen": 1191584768 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032270812437311933, + "loss": 2.9922, + "theoretical_loss": 3.5897709752515725, + "tokens_seen": 1191650304 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032269809428284857, + "loss": 2.758, + "theoretical_loss": 3.5897527917767666, + "tokens_seen": 1191715840 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032268806419257775, + "loss": 2.8315, + "theoretical_loss": 3.5897346095818694, + "tokens_seen": 1191781376 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032267803410230693, + "loss": 2.9661, + "theoretical_loss": 3.589716428666719, + "tokens_seen": 1191846912 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003226680040120361, + "loss": 2.9071, + "theoretical_loss": 3.589698249031157, + "tokens_seen": 1191912448 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032265797392176535, + "loss": 2.9048, + "theoretical_loss": 3.589680070675021, + "tokens_seen": 1191977984 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003226479438314945, + "loss": 2.9068, + "theoretical_loss": 3.5896618935981515, + "tokens_seen": 1192043520 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003226379137412237, + "loss": 2.9344, + "theoretical_loss": 3.5896437178003877, + "tokens_seen": 1192109056 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032262788365095284, + "loss": 2.8817, + "theoretical_loss": 3.58962554328157, + "tokens_seen": 1192174592 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003226178535606821, + "loss": 2.9185, + "theoretical_loss": 3.589607370041538, + "tokens_seen": 1192240128 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032260782347041125, + "loss": 2.9838, + "theoretical_loss": 3.5895891980801307, + "tokens_seen": 1192305664 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032259779338014044, + "loss": 2.945, + "theoretical_loss": 3.5895710273971884, + "tokens_seen": 1192371200 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003225877632898696, + "loss": 2.9456, + "theoretical_loss": 3.589552857992551, + "tokens_seen": 1192436736 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003225777331995988, + "loss": 2.8362, + "theoretical_loss": 3.589534689866058, + "tokens_seen": 1192502272 + }, + { + "epoch": 3.05, + "learning_rate": 0.000322567703109328, + "loss": 2.9089, + "theoretical_loss": 3.5895165230175494, + "tokens_seen": 1192567808 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003225576730190572, + "loss": 2.7801, + "theoretical_loss": 3.5894983574468653, + "tokens_seen": 1192633344 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032254764292878634, + "loss": 2.8648, + "theoretical_loss": 3.589480193153845, + "tokens_seen": 1192698880 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1911629, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.984509229660034, + "objective/train/theoretical_loss": 3.5894756522802065, + "objective/train/tokens_used": 1213175264, + "theoretical_loss": 3.5894756522802065, + "tokens_seen": 1192715264 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003225376128385156, + "loss": 2.8531, + "theoretical_loss": 3.5894620301383293, + "tokens_seen": 1192764416 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003225275827482447, + "loss": 2.8391, + "theoretical_loss": 3.5894438684001573, + "tokens_seen": 1192829952 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032251755265797394, + "loss": 2.8204, + "theoretical_loss": 3.58942570793917, + "tokens_seen": 1192895488 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003225075225677031, + "loss": 2.9024, + "theoretical_loss": 3.589407548755206, + "tokens_seen": 1192961024 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003224974924774323, + "loss": 2.8689, + "theoretical_loss": 3.589389390848107, + "tokens_seen": 1193026560 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003224874623871615, + "loss": 2.7839, + "theoretical_loss": 3.5893712342177118, + "tokens_seen": 1193092096 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003224774322968907, + "loss": 3.0682, + "theoretical_loss": 3.5893530788638612, + "tokens_seen": 1193157632 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032246740220661984, + "loss": 2.8445, + "theoretical_loss": 3.589334924786395, + "tokens_seen": 1193223168 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003224573721163491, + "loss": 2.8852, + "theoretical_loss": 3.5893167719851533, + "tokens_seen": 1193288704 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003224473420260782, + "loss": 2.8783, + "theoretical_loss": 3.589298620459977, + "tokens_seen": 1193354240 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032243731193580744, + "loss": 2.9437, + "theoretical_loss": 3.5892804702107055, + "tokens_seen": 1193419776 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003224272818455366, + "loss": 2.9037, + "theoretical_loss": 3.5892623212371793, + "tokens_seen": 1193485312 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003224172517552658, + "loss": 2.8492, + "theoretical_loss": 3.589244173539239, + "tokens_seen": 1193550848 + }, + { + "epoch": 3.05, + "learning_rate": 0.000322407221664995, + "loss": 2.8264, + "theoretical_loss": 3.5892260271167244, + "tokens_seen": 1193616384 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032239719157472417, + "loss": 2.8417, + "theoretical_loss": 3.589207881969476, + "tokens_seen": 1193681920 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032238716148445335, + "loss": 2.7277, + "theoretical_loss": 3.5891897380973345, + "tokens_seen": 1193747456 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003223771313941826, + "loss": 2.7649, + "theoretical_loss": 3.58917159550014, + "tokens_seen": 1193812992 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003223671013039117, + "loss": 2.9972, + "theoretical_loss": 3.5891534541777332, + "tokens_seen": 1193878528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032235707121364095, + "loss": 2.8223, + "theoretical_loss": 3.5891353141299547, + "tokens_seen": 1193944064 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003223470411233701, + "loss": 2.9365, + "theoretical_loss": 3.5891171753566438, + "tokens_seen": 1194009600 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003223370110330993, + "loss": 2.9525, + "theoretical_loss": 3.5890990378576424, + "tokens_seen": 1194075136 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003223269809428285, + "loss": 2.776, + "theoretical_loss": 3.5890809016327903, + "tokens_seen": 1194140672 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032231695085255767, + "loss": 2.9642, + "theoretical_loss": 3.5890627666819284, + "tokens_seen": 1194206208 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032230692076228685, + "loss": 2.8862, + "theoretical_loss": 3.589044633004897, + "tokens_seen": 1194271744 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003222968906720161, + "loss": 2.7677, + "theoretical_loss": 3.589026500601537, + "tokens_seen": 1194337280 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1912989, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6457345485687256, + "objective/train/theoretical_loss": 3.58902196769969, + "objective/train/tokens_used": 1214813664, + "theoretical_loss": 3.58902196769969, + "tokens_seen": 1194353664 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003222868605817452, + "loss": 2.7448, + "theoretical_loss": 3.5890083694716894, + "tokens_seen": 1194402816 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032227683049147445, + "loss": 3.0028, + "theoretical_loss": 3.5889902396151943, + "tokens_seen": 1194468352 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003222668004012036, + "loss": 2.651, + "theoretical_loss": 3.588972111031892, + "tokens_seen": 1194533888 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003222567703109328, + "loss": 2.9764, + "theoretical_loss": 3.588953983721624, + "tokens_seen": 1194599424 + }, + { + "epoch": 3.05, + "learning_rate": 0.000322246740220662, + "loss": 2.883, + "theoretical_loss": 3.5889358576842314, + "tokens_seen": 1194664960 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032223671013039117, + "loss": 2.8954, + "theoretical_loss": 3.588917732919554, + "tokens_seen": 1194730496 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032222668004012035, + "loss": 2.7806, + "theoretical_loss": 3.5888996094274335, + "tokens_seen": 1194796032 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032221664994984953, + "loss": 2.7491, + "theoretical_loss": 3.5888814872077104, + "tokens_seen": 1194861568 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003222066198595787, + "loss": 2.9346, + "theoretical_loss": 3.5888633662602247, + "tokens_seen": 1194927104 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032219658976930795, + "loss": 2.9027, + "theoretical_loss": 3.5888452465848193, + "tokens_seen": 1194992640 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003221865596790371, + "loss": 2.8296, + "theoretical_loss": 3.5888271281813333, + "tokens_seen": 1195058176 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003221765295887663, + "loss": 2.7403, + "theoretical_loss": 3.5888090110496087, + "tokens_seen": 1195123712 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003221664994984955, + "loss": 2.7944, + "theoretical_loss": 3.5887908951894865, + "tokens_seen": 1195189248 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003221564694082247, + "loss": 3.0392, + "theoretical_loss": 3.5887727806008067, + "tokens_seen": 1195254784 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032214643931795386, + "loss": 2.6396, + "theoretical_loss": 3.5887546672834114, + "tokens_seen": 1195320320 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032213640922768304, + "loss": 2.8709, + "theoretical_loss": 3.5887365552371415, + "tokens_seen": 1195385856 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003221263791374122, + "loss": 2.8602, + "theoretical_loss": 3.588718444461838, + "tokens_seen": 1195451392 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032211634904714145, + "loss": 2.8581, + "theoretical_loss": 3.588700334957342, + "tokens_seen": 1195516928 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003221063189568706, + "loss": 2.7344, + "theoretical_loss": 3.5886822267234946, + "tokens_seen": 1195582464 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003220962888665998, + "loss": 2.8148, + "theoretical_loss": 3.5886641197601374, + "tokens_seen": 1195648000 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032208625877632894, + "loss": 2.9216, + "theoretical_loss": 3.5886460140671113, + "tokens_seen": 1195713536 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003220762286860582, + "loss": 3.0915, + "theoretical_loss": 3.5886279096442575, + "tokens_seen": 1195779072 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003220661985957874, + "loss": 2.9727, + "theoretical_loss": 3.5886098064914176, + "tokens_seen": 1195844608 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032205616850551654, + "loss": 2.797, + "theoretical_loss": 3.588591704608432, + "tokens_seen": 1195910144 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003220461384152458, + "loss": 2.7249, + "theoretical_loss": 3.588573603995144, + "tokens_seen": 1195975680 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1915819, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.474409818649292, + "objective/train/theoretical_loss": 3.5885690790401927, + "objective/train/tokens_used": 1216452064, + "theoretical_loss": 3.5885690790401927, + "tokens_seen": 1195992064 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003220361083249749, + "loss": 2.8146, + "theoretical_loss": 3.588555504651392, + "tokens_seen": 1196041216 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032202607823470414, + "loss": 2.9569, + "theoretical_loss": 3.5885374065770206, + "tokens_seen": 1196106752 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003220160481444333, + "loss": 2.8999, + "theoretical_loss": 3.5885193097718693, + "tokens_seen": 1196172288 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003220060180541625, + "loss": 2.9242, + "theoretical_loss": 3.58850121423578, + "tokens_seen": 1196237824 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003219959879638917, + "loss": 2.7884, + "theoretical_loss": 3.5884831199685943, + "tokens_seen": 1196303360 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003219859578736209, + "loss": 2.8653, + "theoretical_loss": 3.5884650269701535, + "tokens_seen": 1196368896 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032197592778335004, + "loss": 2.9379, + "theoretical_loss": 3.5884469352402992, + "tokens_seen": 1196434432 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003219658976930793, + "loss": 2.8118, + "theoretical_loss": 3.588428844778873, + "tokens_seen": 1196499968 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003219558676028084, + "loss": 2.8087, + "theoretical_loss": 3.5884107555857168, + "tokens_seen": 1196565504 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032194583751253764, + "loss": 2.9743, + "theoretical_loss": 3.588392667660672, + "tokens_seen": 1196631040 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003219358074222668, + "loss": 2.9932, + "theoretical_loss": 3.58837458100358, + "tokens_seen": 1196696576 + }, + { + "epoch": 3.05, + "learning_rate": 0.000321925777331996, + "loss": 2.9015, + "theoretical_loss": 3.5883564956142826, + "tokens_seen": 1196762112 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003219157472417252, + "loss": 2.8076, + "theoretical_loss": 3.5883384114926216, + "tokens_seen": 1196827648 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032190571715145437, + "loss": 3.0673, + "theoretical_loss": 3.588320328638439, + "tokens_seen": 1196893184 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032189568706118355, + "loss": 2.9501, + "theoretical_loss": 3.588302247051576, + "tokens_seen": 1196958720 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003218856569709128, + "loss": 2.9179, + "theoretical_loss": 3.5882841667318752, + "tokens_seen": 1197024256 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003218756268806419, + "loss": 3.0072, + "theoretical_loss": 3.588266087679178, + "tokens_seen": 1197089792 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032186559679037115, + "loss": 2.8542, + "theoretical_loss": 3.5882480098933263, + "tokens_seen": 1197155328 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003218555667001003, + "loss": 2.8668, + "theoretical_loss": 3.588229933374161, + "tokens_seen": 1197220864 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003218455366098295, + "loss": 2.826, + "theoretical_loss": 3.588211858121526, + "tokens_seen": 1197286400 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003218355065195587, + "loss": 2.7952, + "theoretical_loss": 3.5881937841352616, + "tokens_seen": 1197351936 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032182547642928787, + "loss": 2.8893, + "theoretical_loss": 3.588175711415211, + "tokens_seen": 1197417472 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032181544633901705, + "loss": 2.8416, + "theoretical_loss": 3.5881576399612145, + "tokens_seen": 1197483008 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003218054162487463, + "loss": 2.8869, + "theoretical_loss": 3.5881395697731158, + "tokens_seen": 1197548544 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003217953861584754, + "loss": 2.8956, + "theoretical_loss": 3.5881215008507557, + "tokens_seen": 1197614080 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1918337, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.168741464614868, + "objective/train/theoretical_loss": 3.5881169838179194, + "objective/train/tokens_used": 1218090464, + "theoretical_loss": 3.5881169838179194, + "tokens_seen": 1197630464 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032178535606820465, + "loss": 2.9521, + "theoretical_loss": 3.588103433193978, + "tokens_seen": 1197679616 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003217753259779338, + "loss": 2.8764, + "theoretical_loss": 3.5880853668026225, + "tokens_seen": 1197745152 + }, + { + "epoch": 3.05, + "learning_rate": 0.000321765295887663, + "loss": 2.7298, + "theoretical_loss": 3.588067301676533, + "tokens_seen": 1197810688 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003217552657973922, + "loss": 2.9325, + "theoretical_loss": 3.588049237815552, + "tokens_seen": 1197876224 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032174523570712137, + "loss": 2.9365, + "theoretical_loss": 3.5880311752195198, + "tokens_seen": 1197941760 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032173520561685055, + "loss": 2.8045, + "theoretical_loss": 3.5880131138882803, + "tokens_seen": 1198007296 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032172517552657974, + "loss": 2.9798, + "theoretical_loss": 3.5879950538216754, + "tokens_seen": 1198072832 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003217151454363089, + "loss": 2.7819, + "theoretical_loss": 3.5879769950195466, + "tokens_seen": 1198138368 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032170511534603815, + "loss": 2.7643, + "theoretical_loss": 3.587958937481737, + "tokens_seen": 1198203904 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003216950852557673, + "loss": 2.8629, + "theoretical_loss": 3.5879408812080893, + "tokens_seen": 1198269440 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003216850551654965, + "loss": 2.7365, + "theoretical_loss": 3.5879228261984446, + "tokens_seen": 1198334976 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003216750250752257, + "loss": 2.8822, + "theoretical_loss": 3.5879047724526463, + "tokens_seen": 1198400512 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003216649949849549, + "loss": 3.0555, + "theoretical_loss": 3.5878867199705367, + "tokens_seen": 1198466048 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032165496489468406, + "loss": 2.7348, + "theoretical_loss": 3.587868668751957, + "tokens_seen": 1198531584 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032164493480441324, + "loss": 2.773, + "theoretical_loss": 3.587850618796752, + "tokens_seen": 1198597120 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003216349047141424, + "loss": 2.7882, + "theoretical_loss": 3.587832570104762, + "tokens_seen": 1198662656 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032162487462387165, + "loss": 2.9859, + "theoretical_loss": 3.5878145226758313, + "tokens_seen": 1198728192 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003216148445336008, + "loss": 2.9102, + "theoretical_loss": 3.5877964765098014, + "tokens_seen": 1198793728 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032160481444333, + "loss": 2.9279, + "theoretical_loss": 3.587778431606515, + "tokens_seen": 1198859264 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032159478435305914, + "loss": 2.8688, + "theoretical_loss": 3.587760387965815, + "tokens_seen": 1198924800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003215847542627884, + "loss": 2.9355, + "theoretical_loss": 3.587742345587544, + "tokens_seen": 1198990336 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032157472417251756, + "loss": 2.638, + "theoretical_loss": 3.587724304471544, + "tokens_seen": 1199055872 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032156469408224674, + "loss": 2.8871, + "theoretical_loss": 3.587706264617659, + "tokens_seen": 1199121408 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003215546639919759, + "loss": 2.8264, + "theoretical_loss": 3.5876882260257306, + "tokens_seen": 1199186944 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003215446339017051, + "loss": 2.9555, + "theoretical_loss": 3.5876701886956024, + "tokens_seen": 1199252480 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1921136, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0983479022979736, + "objective/train/theoretical_loss": 3.5876656795602084, + "objective/train/tokens_used": 1219728864, + "theoretical_loss": 3.5876656795602084, + "tokens_seen": 1199268864 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003215346038114343, + "loss": 2.935, + "theoretical_loss": 3.5876521526271166, + "tokens_seen": 1199318016 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003215245737211635, + "loss": 2.9208, + "theoretical_loss": 3.587634117820116, + "tokens_seen": 1199383552 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032151454363089265, + "loss": 2.8766, + "theoretical_loss": 3.587616084274444, + "tokens_seen": 1199449088 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003215045135406219, + "loss": 2.9301, + "theoretical_loss": 3.587598051989943, + "tokens_seen": 1199514624 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032149448345035106, + "loss": 2.8986, + "theoretical_loss": 3.587580020966456, + "tokens_seen": 1199580160 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032148445336008024, + "loss": 2.8567, + "theoretical_loss": 3.5875619912038257, + "tokens_seen": 1199645696 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003214744232698094, + "loss": 2.9629, + "theoretical_loss": 3.5875439627018957, + "tokens_seen": 1199711232 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003214643931795386, + "loss": 2.7306, + "theoretical_loss": 3.5875259354605085, + "tokens_seen": 1199776768 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003214543630892678, + "loss": 2.849, + "theoretical_loss": 3.5875079094795073, + "tokens_seen": 1199842304 + }, + { + "epoch": 3.05, + "learning_rate": 0.000321444332998997, + "loss": 2.9074, + "theoretical_loss": 3.587489884758735, + "tokens_seen": 1199907840 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032143430290872615, + "loss": 2.9091, + "theoretical_loss": 3.5874718612980345, + "tokens_seen": 1199973376 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003214242728184554, + "loss": 2.875, + "theoretical_loss": 3.5874538390972495, + "tokens_seen": 1200038912 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003214142427281845, + "loss": 2.9086, + "theoretical_loss": 3.5874358181562225, + "tokens_seen": 1200104448 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032140421263791375, + "loss": 2.9182, + "theoretical_loss": 3.587417798474797, + "tokens_seen": 1200169984 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032139418254764293, + "loss": 2.905, + "theoretical_loss": 3.587399780052816, + "tokens_seen": 1200235520 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003213841524573721, + "loss": 2.969, + "theoretical_loss": 3.5873817628901232, + "tokens_seen": 1200301056 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003213741223671013, + "loss": 2.8709, + "theoretical_loss": 3.5873637469865614, + "tokens_seen": 1200366592 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003213640922768305, + "loss": 2.9676, + "theoretical_loss": 3.5873457323419737, + "tokens_seen": 1200432128 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032135406218655965, + "loss": 2.7907, + "theoretical_loss": 3.587327718956203, + "tokens_seen": 1200497664 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003213440320962889, + "loss": 2.7264, + "theoretical_loss": 3.5873097068290942, + "tokens_seen": 1200563200 + }, + { + "epoch": 3.05, + "learning_rate": 0.000321334002006018, + "loss": 2.8994, + "theoretical_loss": 3.5872916959604897, + "tokens_seen": 1200628736 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032132397191574725, + "loss": 2.855, + "theoretical_loss": 3.5872736863502324, + "tokens_seen": 1200694272 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003213139418254765, + "loss": 2.8288, + "theoretical_loss": 3.587255677998167, + "tokens_seen": 1200759808 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003213039117352056, + "loss": 2.8468, + "theoretical_loss": 3.587237670904135, + "tokens_seen": 1200825344 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032129388164493485, + "loss": 2.7549, + "theoretical_loss": 3.5872196650679813, + "tokens_seen": 1200890880 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1923865, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.884381055831909, + "objective/train/theoretical_loss": 3.5872151638054683, + "objective/train/tokens_used": 1221367264, + "theoretical_loss": 3.5872151638054683, + "tokens_seen": 1200907264 + }, + { + "epoch": 3.05, + "learning_rate": 0.000321283851554664, + "loss": 2.936, + "theoretical_loss": 3.5872016604895496, + "tokens_seen": 1200956416 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003212738214643932, + "loss": 2.902, + "theoretical_loss": 3.5871836571686826, + "tokens_seen": 1201021952 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003212637913741224, + "loss": 2.9556, + "theoretical_loss": 3.587165655105224, + "tokens_seen": 1201087488 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003212537612838516, + "loss": 2.9648, + "theoretical_loss": 3.5871476542990175, + "tokens_seen": 1201153024 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032124373119358075, + "loss": 2.9206, + "theoretical_loss": 3.587129654749907, + "tokens_seen": 1201218560 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032123370110330994, + "loss": 2.6488, + "theoretical_loss": 3.587111656457736, + "tokens_seen": 1201284096 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003212236710130391, + "loss": 2.8532, + "theoretical_loss": 3.5870936594223477, + "tokens_seen": 1201349632 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032121364092276835, + "loss": 2.8611, + "theoretical_loss": 3.5870756636435863, + "tokens_seen": 1201415168 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003212036108324975, + "loss": 3.0622, + "theoretical_loss": 3.587057669121295, + "tokens_seen": 1201480704 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003211935807422267, + "loss": 2.8742, + "theoretical_loss": 3.5870396758553182, + "tokens_seen": 1201546240 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003211835506519559, + "loss": 2.6923, + "theoretical_loss": 3.5870216838454994, + "tokens_seen": 1201611776 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003211735205616851, + "loss": 3.0394, + "theoretical_loss": 3.587003693091682, + "tokens_seen": 1201677312 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032116349047141426, + "loss": 2.7935, + "theoretical_loss": 3.5869857035937107, + "tokens_seen": 1201742848 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032115346038114344, + "loss": 2.7928, + "theoretical_loss": 3.5869677153514283, + "tokens_seen": 1201808384 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003211434302908726, + "loss": 2.6488, + "theoretical_loss": 3.58694972836468, + "tokens_seen": 1201873920 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032113340020060185, + "loss": 2.8384, + "theoretical_loss": 3.5869317426333085, + "tokens_seen": 1201939456 + }, + { + "epoch": 3.05, + "learning_rate": 0.000321123370110331, + "loss": 2.8466, + "theoretical_loss": 3.5869137581571575, + "tokens_seen": 1202004992 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003211133400200602, + "loss": 2.8561, + "theoretical_loss": 3.5868957749360724, + "tokens_seen": 1202070528 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032110330992978934, + "loss": 2.9602, + "theoretical_loss": 3.5868777929698963, + "tokens_seen": 1202136064 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003210932798395186, + "loss": 2.801, + "theoretical_loss": 3.5868598122584734, + "tokens_seen": 1202201600 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032108324974924776, + "loss": 2.9222, + "theoretical_loss": 3.586841832801648, + "tokens_seen": 1202267136 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032107321965897694, + "loss": 2.8307, + "theoretical_loss": 3.5868238545992632, + "tokens_seen": 1202332672 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003210631895687061, + "loss": 2.7915, + "theoretical_loss": 3.586805877651164, + "tokens_seen": 1202398208 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003210531594784353, + "loss": 2.7258, + "theoretical_loss": 3.5867879019571944, + "tokens_seen": 1202463744 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003210431293881645, + "loss": 2.7715, + "theoretical_loss": 3.5867699275171985, + "tokens_seen": 1202529280 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1926800, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7117104530334473, + "objective/train/theoretical_loss": 3.586765434103115, + "objective/train/tokens_used": 1223005664, + "theoretical_loss": 3.586765434103115, + "tokens_seen": 1202545664 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003210330992978937, + "loss": 2.8755, + "theoretical_loss": 3.586751954331021, + "tokens_seen": 1202594816 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032102306920762285, + "loss": 2.6384, + "theoretical_loss": 3.586733982398505, + "tokens_seen": 1202660352 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003210130391173521, + "loss": 2.9304, + "theoretical_loss": 3.5867160117194956, + "tokens_seen": 1202725888 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032100300902708126, + "loss": 2.8171, + "theoretical_loss": 3.5866980422938366, + "tokens_seen": 1202791424 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032099297893681044, + "loss": 2.8964, + "theoretical_loss": 3.5866800741213725, + "tokens_seen": 1202856960 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003209829488465396, + "loss": 3.0444, + "theoretical_loss": 3.586662107201948, + "tokens_seen": 1202922496 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003209729187562688, + "loss": 2.8282, + "theoretical_loss": 3.586644141535407, + "tokens_seen": 1202988032 + }, + { + "epoch": 3.05, + "learning_rate": 0.000320962888665998, + "loss": 2.8029, + "theoretical_loss": 3.5866261771215937, + "tokens_seen": 1203053568 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003209528585757272, + "loss": 2.977, + "theoretical_loss": 3.5866082139603535, + "tokens_seen": 1203119104 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032094282848545635, + "loss": 3.0489, + "theoretical_loss": 3.5865902520515296, + "tokens_seen": 1203184640 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003209327983951856, + "loss": 2.89, + "theoretical_loss": 3.586572291394967, + "tokens_seen": 1203250176 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003209227683049147, + "loss": 2.8998, + "theoretical_loss": 3.58655433199051, + "tokens_seen": 1203315712 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032091273821464395, + "loss": 2.8582, + "theoretical_loss": 3.586536373838004, + "tokens_seen": 1203381248 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032090270812437313, + "loss": 2.7408, + "theoretical_loss": 3.586518416937293, + "tokens_seen": 1203446784 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003208926780341023, + "loss": 2.9456, + "theoretical_loss": 3.586500461288221, + "tokens_seen": 1203512320 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003208826479438315, + "loss": 2.7926, + "theoretical_loss": 3.586482506890633, + "tokens_seen": 1203577856 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003208726178535607, + "loss": 2.8599, + "theoretical_loss": 3.5864645537443742, + "tokens_seen": 1203643392 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032086258776328985, + "loss": 2.7073, + "theoretical_loss": 3.586446601849288, + "tokens_seen": 1203708928 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003208525576730191, + "loss": 2.8298, + "theoretical_loss": 3.586428651205221, + "tokens_seen": 1203774464 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003208425275827482, + "loss": 2.8042, + "theoretical_loss": 3.5864107018120164, + "tokens_seen": 1203840000 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032083249749247745, + "loss": 2.9901, + "theoretical_loss": 3.586392753669519, + "tokens_seen": 1203905536 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032082246740220663, + "loss": 2.9282, + "theoretical_loss": 3.586374806777574, + "tokens_seen": 1203971072 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003208124373119358, + "loss": 3.0186, + "theoretical_loss": 3.5863568611360264, + "tokens_seen": 1204036608 + }, + { + "epoch": 3.05, + "learning_rate": 0.000320802407221665, + "loss": 2.8312, + "theoretical_loss": 3.5863389167447206, + "tokens_seen": 1204102144 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003207923771313942, + "loss": 3.0555, + "theoretical_loss": 3.5863209736035015, + "tokens_seen": 1204167680 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1929726, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9264025688171387, + "objective/train/theoretical_loss": 3.5863164880135043, + "objective/train/tokens_used": 1224644064, + "theoretical_loss": 3.5863164880135043, + "tokens_seen": 1204184064 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032078234704112336, + "loss": 2.9715, + "theoretical_loss": 3.586303031712214, + "tokens_seen": 1204233216 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003207723169508526, + "loss": 2.7972, + "theoretical_loss": 3.5862850910707036, + "tokens_seen": 1204298752 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003207622868605817, + "loss": 2.911, + "theoretical_loss": 3.5862671516788143, + "tokens_seen": 1204364288 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032075225677031095, + "loss": 3.0076, + "theoretical_loss": 3.5862492135363917, + "tokens_seen": 1204429824 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003207422266800401, + "loss": 2.6159, + "theoretical_loss": 3.5862312766432805, + "tokens_seen": 1204495360 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003207321965897693, + "loss": 2.8924, + "theoretical_loss": 3.586213340999326, + "tokens_seen": 1204560896 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003207221664994985, + "loss": 2.7192, + "theoretical_loss": 3.5861954066043733, + "tokens_seen": 1204626432 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003207121364092277, + "loss": 2.8092, + "theoretical_loss": 3.5861774734582665, + "tokens_seen": 1204691968 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032070210631895686, + "loss": 3.0244, + "theoretical_loss": 3.5861595415608525, + "tokens_seen": 1204757504 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003206920762286861, + "loss": 2.8425, + "theoretical_loss": 3.586141610911975, + "tokens_seen": 1204823040 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003206820461384152, + "loss": 2.8455, + "theoretical_loss": 3.586123681511479, + "tokens_seen": 1204888576 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032067201604814446, + "loss": 2.8867, + "theoretical_loss": 3.586105753359211, + "tokens_seen": 1204954112 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003206619859578736, + "loss": 2.8721, + "theoretical_loss": 3.586087826455015, + "tokens_seen": 1205019648 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003206519558676028, + "loss": 2.9107, + "theoretical_loss": 3.586069900798737, + "tokens_seen": 1205085184 + }, + { + "epoch": 3.05, + "learning_rate": 0.000320641925777332, + "loss": 3.0082, + "theoretical_loss": 3.5860519763902223, + "tokens_seen": 1205150720 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003206318956870612, + "loss": 3.0471, + "theoretical_loss": 3.5860340532293153, + "tokens_seen": 1205216256 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032062186559679036, + "loss": 2.8733, + "theoretical_loss": 3.586016131315862, + "tokens_seen": 1205281792 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032061183550651954, + "loss": 2.8845, + "theoretical_loss": 3.5859982106497084, + "tokens_seen": 1205347328 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003206018054162487, + "loss": 2.8972, + "theoretical_loss": 3.5859802912306984, + "tokens_seen": 1205412864 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032059177532597796, + "loss": 3.0188, + "theoretical_loss": 3.5859623730586785, + "tokens_seen": 1205478400 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003205817452357071, + "loss": 2.9756, + "theoretical_loss": 3.585944456133493, + "tokens_seen": 1205543936 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003205717151454363, + "loss": 2.8094, + "theoretical_loss": 3.5859265404549894, + "tokens_seen": 1205609472 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003205616850551655, + "loss": 2.8639, + "theoretical_loss": 3.5859086260230115, + "tokens_seen": 1205675008 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003205516549648947, + "loss": 2.8616, + "theoretical_loss": 3.585890712837405, + "tokens_seen": 1205740544 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003205416248746239, + "loss": 2.9278, + "theoretical_loss": 3.585872800898016, + "tokens_seen": 1205806080 + }, + { + "epoch": 3.05, + "objective/train/docs_used": 1932020, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8131229877471924, + "objective/train/theoretical_loss": 3.585868323107872, + "objective/train/tokens_used": 1226282464, + "theoretical_loss": 3.585868323107872, + "tokens_seen": 1205822464 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032053159478435305, + "loss": 2.8779, + "theoretical_loss": 3.58585489020469, + "tokens_seen": 1205871616 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003205215646940823, + "loss": 2.854, + "theoretical_loss": 3.5858369807572723, + "tokens_seen": 1205937152 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032051153460381146, + "loss": 2.6708, + "theoretical_loss": 3.5858190725556085, + "tokens_seen": 1206002688 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032050150451354064, + "loss": 2.8359, + "theoretical_loss": 3.5858011655995443, + "tokens_seen": 1206068224 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003204914744232698, + "loss": 2.7562, + "theoretical_loss": 3.5857832598889257, + "tokens_seen": 1206133760 + }, + { + "epoch": 3.05, + "learning_rate": 0.000320481444332999, + "loss": 2.8042, + "theoretical_loss": 3.5857653554235984, + "tokens_seen": 1206199296 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003204714142427282, + "loss": 2.8831, + "theoretical_loss": 3.5857474522034076, + "tokens_seen": 1206264832 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003204613841524574, + "loss": 2.7865, + "theoretical_loss": 3.5857295502281996, + "tokens_seen": 1206330368 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032045135406218655, + "loss": 3.0341, + "theoretical_loss": 3.58571164949782, + "tokens_seen": 1206395904 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003204413239719158, + "loss": 2.8056, + "theoretical_loss": 3.585693750012114, + "tokens_seen": 1206461440 + }, + { + "epoch": 3.05, + "learning_rate": 0.0003204312938816449, + "loss": 2.9969, + "theoretical_loss": 3.5856758517709286, + "tokens_seen": 1206526976 + }, + { + "epoch": 3.05, + "learning_rate": 0.00032042126379137415, + "loss": 2.8673, + "theoretical_loss": 3.58565795477411, + "tokens_seen": 1206592512 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032041123370110333, + "loss": 2.837, + "theoretical_loss": 3.585640059021502, + "tokens_seen": 1206658048 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003204012036108325, + "loss": 2.8816, + "theoretical_loss": 3.5856221645129525, + "tokens_seen": 1206723584 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003203911735205617, + "loss": 2.6655, + "theoretical_loss": 3.5856042712483065, + "tokens_seen": 1206789120 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003203811434302909, + "loss": 2.8335, + "theoretical_loss": 3.58558637922741, + "tokens_seen": 1206854656 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032037111334002005, + "loss": 3.0126, + "theoretical_loss": 3.5855684884501096, + "tokens_seen": 1206920192 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003203610832497493, + "loss": 2.9142, + "theoretical_loss": 3.585550598916251, + "tokens_seen": 1206985728 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003203510531594784, + "loss": 3.0389, + "theoretical_loss": 3.5855327106256802, + "tokens_seen": 1207051264 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032034102306920765, + "loss": 2.6842, + "theoretical_loss": 3.5855148235782437, + "tokens_seen": 1207116800 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032033099297893683, + "loss": 2.8259, + "theoretical_loss": 3.5854969377737866, + "tokens_seen": 1207182336 + }, + { + "epoch": 3.06, + "learning_rate": 0.000320320962888666, + "loss": 2.8702, + "theoretical_loss": 3.5854790532121563, + "tokens_seen": 1207247872 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003203109327983952, + "loss": 2.9872, + "theoretical_loss": 3.585461169893198, + "tokens_seen": 1207313408 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003203009027081244, + "loss": 2.7442, + "theoretical_loss": 3.5854432878167586, + "tokens_seen": 1207378944 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032029087261785356, + "loss": 2.9868, + "theoretical_loss": 3.5854254069826843, + "tokens_seen": 1207444480 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1934942, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0421640872955322, + "objective/train/theoretical_loss": 3.5854209369682675, + "objective/train/tokens_used": 1227920864, + "theoretical_loss": 3.5854209369682675, + "tokens_seen": 1207460864 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003202808425275828, + "loss": 2.9746, + "theoretical_loss": 3.585407527390821, + "tokens_seen": 1207510016 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003202708124373119, + "loss": 2.8248, + "theoretical_loss": 3.585389649041015, + "tokens_seen": 1207575552 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032026078234704115, + "loss": 2.891, + "theoretical_loss": 3.5853717719331124, + "tokens_seen": 1207641088 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003202507522567703, + "loss": 2.9793, + "theoretical_loss": 3.5853538960669606, + "tokens_seen": 1207706624 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003202407221664995, + "loss": 2.9125, + "theoretical_loss": 3.585336021442405, + "tokens_seen": 1207772160 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003202306920762287, + "loss": 2.9131, + "theoretical_loss": 3.585318148059292, + "tokens_seen": 1207837696 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003202206619859579, + "loss": 2.9992, + "theoretical_loss": 3.5853002759174686, + "tokens_seen": 1207903232 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032021063189568706, + "loss": 2.9926, + "theoretical_loss": 3.585282405016781, + "tokens_seen": 1207968768 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003202006018054163, + "loss": 2.7035, + "theoretical_loss": 3.585264535357075, + "tokens_seen": 1208034304 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003201905717151454, + "loss": 2.9776, + "theoretical_loss": 3.585246666938198, + "tokens_seen": 1208099840 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032018054162487466, + "loss": 2.8797, + "theoretical_loss": 3.5852287997599968, + "tokens_seen": 1208165376 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003201705115346038, + "loss": 2.905, + "theoretical_loss": 3.585210933822317, + "tokens_seen": 1208230912 + }, + { + "epoch": 3.06, + "learning_rate": 0.000320160481444333, + "loss": 2.9415, + "theoretical_loss": 3.5851930691250056, + "tokens_seen": 1208296448 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003201504513540622, + "loss": 2.7845, + "theoretical_loss": 3.585175205667909, + "tokens_seen": 1208361984 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003201404212637914, + "loss": 3.0239, + "theoretical_loss": 3.5851573434508746, + "tokens_seen": 1208427520 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032013039117352056, + "loss": 2.9391, + "theoretical_loss": 3.585139482473748, + "tokens_seen": 1208493056 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032012036108324974, + "loss": 2.7719, + "theoretical_loss": 3.5851216227363767, + "tokens_seen": 1208558592 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003201103309929789, + "loss": 2.9227, + "theoretical_loss": 3.585103764238607, + "tokens_seen": 1208624128 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032010030090270816, + "loss": 2.9642, + "theoretical_loss": 3.585085906980286, + "tokens_seen": 1208689664 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003200902708124373, + "loss": 2.6818, + "theoretical_loss": 3.5850680509612607, + "tokens_seen": 1208755200 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003200802407221665, + "loss": 2.8337, + "theoretical_loss": 3.5850501961813768, + "tokens_seen": 1208820736 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032007021063189565, + "loss": 2.7632, + "theoretical_loss": 3.585032342640482, + "tokens_seen": 1208886272 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003200601805416249, + "loss": 2.7964, + "theoretical_loss": 3.5850144903384233, + "tokens_seen": 1208951808 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032005015045135407, + "loss": 2.863, + "theoretical_loss": 3.584996639275047, + "tokens_seen": 1209017344 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032004012036108325, + "loss": 2.8953, + "theoretical_loss": 3.5849787894502003, + "tokens_seen": 1209082880 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1937725, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8035290241241455, + "objective/train/theoretical_loss": 3.584974327187491, + "objective/train/tokens_used": 1229559264, + "theoretical_loss": 3.584974327187491, + "tokens_seen": 1209099264 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032003009027081243, + "loss": 2.7861, + "theoretical_loss": 3.5849609408637306, + "tokens_seen": 1209148416 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032002006018054166, + "loss": 2.7586, + "theoretical_loss": 3.5849430935154842, + "tokens_seen": 1209213952 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003200100300902708, + "loss": 2.9096, + "theoretical_loss": 3.584925247405308, + "tokens_seen": 1209279488 + }, + { + "epoch": 3.06, + "learning_rate": 0.00032, + "loss": 2.8159, + "theoretical_loss": 3.5849074025330494, + "tokens_seen": 1209345024 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031998996990972915, + "loss": 2.9302, + "theoretical_loss": 3.5848895588985554, + "tokens_seen": 1209410560 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003199799398194584, + "loss": 2.9234, + "theoretical_loss": 3.5848717165016737, + "tokens_seen": 1209476096 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031996990972918757, + "loss": 2.781, + "theoretical_loss": 3.58485387534225, + "tokens_seen": 1209541632 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031995987963891675, + "loss": 2.9562, + "theoretical_loss": 3.5848360354201327, + "tokens_seen": 1209607168 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031994984954864593, + "loss": 2.7964, + "theoretical_loss": 3.5848181967351684, + "tokens_seen": 1209672704 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003199398194583751, + "loss": 2.8484, + "theoretical_loss": 3.584800359287205, + "tokens_seen": 1209738240 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003199297893681043, + "loss": 2.8882, + "theoretical_loss": 3.584782523076088, + "tokens_seen": 1209803776 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031991975927783353, + "loss": 2.7774, + "theoretical_loss": 3.584764688101666, + "tokens_seen": 1209869312 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031990972918756266, + "loss": 2.7681, + "theoretical_loss": 3.5847468543637864, + "tokens_seen": 1209934848 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003198996990972919, + "loss": 3.0007, + "theoretical_loss": 3.5847290218622962, + "tokens_seen": 1210000384 + }, + { + "epoch": 3.06, + "learning_rate": 0.000319889669007021, + "loss": 2.9102, + "theoretical_loss": 3.584711190597042, + "tokens_seen": 1210065920 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031987963891675025, + "loss": 2.9177, + "theoretical_loss": 3.584693360567872, + "tokens_seen": 1210131456 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031986960882647943, + "loss": 2.8221, + "theoretical_loss": 3.584675531774634, + "tokens_seen": 1210196992 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003198595787362086, + "loss": 2.8199, + "theoretical_loss": 3.584657704217174, + "tokens_seen": 1210262528 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003198495486459378, + "loss": 2.8679, + "theoretical_loss": 3.5846398778953406, + "tokens_seen": 1210328064 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031983951855566703, + "loss": 2.7629, + "theoretical_loss": 3.5846220528089807, + "tokens_seen": 1210393600 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031982948846539616, + "loss": 2.8973, + "theoretical_loss": 3.584604228957942, + "tokens_seen": 1210459136 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003198194583751254, + "loss": 2.8721, + "theoretical_loss": 3.584586406342072, + "tokens_seen": 1210524672 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003198094282848546, + "loss": 2.8879, + "theoretical_loss": 3.584568584961218, + "tokens_seen": 1210590208 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031979939819458376, + "loss": 2.9151, + "theoretical_loss": 3.584550764815228, + "tokens_seen": 1210655744 + }, + { + "epoch": 3.06, + "learning_rate": 0.000319789368104313, + "loss": 2.9602, + "theoretical_loss": 3.584532945903949, + "tokens_seen": 1210721280 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1939010, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.471855640411377, + "objective/train/theoretical_loss": 3.584528491369035, + "objective/train/tokens_used": 1231197664, + "theoretical_loss": 3.584528491369035, + "tokens_seen": 1210737664 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003197793380140421, + "loss": 2.7433, + "theoretical_loss": 3.5845151282272294, + "tokens_seen": 1210786816 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031976930792377135, + "loss": 2.9143, + "theoretical_loss": 3.584497311784916, + "tokens_seen": 1210852352 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003197592778335005, + "loss": 2.8889, + "theoretical_loss": 3.584479496576857, + "tokens_seen": 1210917888 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003197492477432297, + "loss": 2.879, + "theoretical_loss": 3.5844616826029005, + "tokens_seen": 1210983424 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003197392176529589, + "loss": 2.9055, + "theoretical_loss": 3.5844438698628935, + "tokens_seen": 1211048960 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003197291875626881, + "loss": 2.9732, + "theoretical_loss": 3.5844260583566836, + "tokens_seen": 1211114496 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031971915747241726, + "loss": 3.007, + "theoretical_loss": 3.5844082480841197, + "tokens_seen": 1211180032 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003197091273821465, + "loss": 2.9222, + "theoretical_loss": 3.5843904390450483, + "tokens_seen": 1211245568 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003196990972918756, + "loss": 2.9542, + "theoretical_loss": 3.5843726312393183, + "tokens_seen": 1211311104 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031968906720160486, + "loss": 2.8498, + "theoretical_loss": 3.584354824666777, + "tokens_seen": 1211376640 + }, + { + "epoch": 3.06, + "learning_rate": 0.000319679037111334, + "loss": 2.8841, + "theoretical_loss": 3.584337019327272, + "tokens_seen": 1211442176 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003196690070210632, + "loss": 2.7451, + "theoretical_loss": 3.5843192152206518, + "tokens_seen": 1211507712 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003196589769307924, + "loss": 2.9334, + "theoretical_loss": 3.5843014123467647, + "tokens_seen": 1211573248 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003196489468405216, + "loss": 2.851, + "theoretical_loss": 3.5842836107054574, + "tokens_seen": 1211638784 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031963891675025076, + "loss": 2.9276, + "theoretical_loss": 3.584265810296579, + "tokens_seen": 1211704320 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031962888665997994, + "loss": 2.8951, + "theoretical_loss": 3.584248011119977, + "tokens_seen": 1211769856 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003196188565697091, + "loss": 2.7727, + "theoretical_loss": 3.5842302131755, + "tokens_seen": 1211835392 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031960882647943836, + "loss": 2.9023, + "theoretical_loss": 3.584212416462995, + "tokens_seen": 1211900928 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003195987963891675, + "loss": 3.01, + "theoretical_loss": 3.5841946209823115, + "tokens_seen": 1211966464 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003195887662988967, + "loss": 2.8021, + "theoretical_loss": 3.5841768267332967, + "tokens_seen": 1212032000 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031957873620862585, + "loss": 2.8812, + "theoretical_loss": 3.5841590337157987, + "tokens_seen": 1212097536 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003195687061183551, + "loss": 2.9523, + "theoretical_loss": 3.5841412419296663, + "tokens_seen": 1212163072 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031955867602808427, + "loss": 2.6062, + "theoretical_loss": 3.5841234513747477, + "tokens_seen": 1212228608 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031954864593781345, + "loss": 2.9106, + "theoretical_loss": 3.5841056620508907, + "tokens_seen": 1212294144 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031953861584754263, + "loss": 2.8387, + "theoretical_loss": 3.5840878739579436, + "tokens_seen": 1212359680 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1941813, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5740745067596436, + "objective/train/theoretical_loss": 3.5840834271270188, + "objective/train/tokens_used": 1232836064, + "theoretical_loss": 3.5840834271270188, + "tokens_seen": 1212376064 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031952858575727186, + "loss": 2.8158, + "theoretical_loss": 3.5840700870957547, + "tokens_seen": 1212425216 + }, + { + "epoch": 3.06, + "learning_rate": 0.000319518555667001, + "loss": 2.7728, + "theoretical_loss": 3.5840523014641725, + "tokens_seen": 1212490752 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003195085255767302, + "loss": 2.8453, + "theoretical_loss": 3.5840345170630457, + "tokens_seen": 1212556288 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031949849548645935, + "loss": 2.8394, + "theoretical_loss": 3.5840167338922218, + "tokens_seen": 1212621824 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003194884653961886, + "loss": 2.9378, + "theoretical_loss": 3.58399895195155, + "tokens_seen": 1212687360 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031947843530591777, + "loss": 2.8158, + "theoretical_loss": 3.583981171240878, + "tokens_seen": 1212752896 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031946840521564695, + "loss": 2.7478, + "theoretical_loss": 3.583963391760055, + "tokens_seen": 1212818432 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031945837512537613, + "loss": 2.8528, + "theoretical_loss": 3.583945613508929, + "tokens_seen": 1212883968 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003194483450351053, + "loss": 2.9262, + "theoretical_loss": 3.583927836487349, + "tokens_seen": 1212949504 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003194383149448345, + "loss": 2.9274, + "theoretical_loss": 3.583910060695162, + "tokens_seen": 1213015040 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031942828485456373, + "loss": 2.9735, + "theoretical_loss": 3.583892286132219, + "tokens_seen": 1213080576 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031941825476429286, + "loss": 2.8527, + "theoretical_loss": 3.583874512798367, + "tokens_seen": 1213146112 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003194082246740221, + "loss": 2.8894, + "theoretical_loss": 3.583856740693455, + "tokens_seen": 1213211648 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003193981945837512, + "loss": 2.8675, + "theoretical_loss": 3.5838389698173314, + "tokens_seen": 1213277184 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031938816449348045, + "loss": 2.8234, + "theoretical_loss": 3.5838212001698455, + "tokens_seen": 1213342720 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031937813440320964, + "loss": 2.8018, + "theoretical_loss": 3.5838034317508454, + "tokens_seen": 1213408256 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003193681043129388, + "loss": 2.8766, + "theoretical_loss": 3.5837856645601804, + "tokens_seen": 1213473792 + }, + { + "epoch": 3.06, + "learning_rate": 0.000319358074222668, + "loss": 2.6683, + "theoretical_loss": 3.5837678985976984, + "tokens_seen": 1213539328 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031934804413239723, + "loss": 2.8217, + "theoretical_loss": 3.5837501338632487, + "tokens_seen": 1213604864 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031933801404212636, + "loss": 2.9312, + "theoretical_loss": 3.583732370356681, + "tokens_seen": 1213670400 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003193279839518556, + "loss": 2.9105, + "theoretical_loss": 3.583714608077842, + "tokens_seen": 1213735936 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003193179538615847, + "loss": 2.7371, + "theoretical_loss": 3.583696847026582, + "tokens_seen": 1213801472 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031930792377131396, + "loss": 2.7732, + "theoretical_loss": 3.58367908720275, + "tokens_seen": 1213867008 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031929789368104314, + "loss": 2.8432, + "theoretical_loss": 3.583661328606194, + "tokens_seen": 1213932544 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003192878635907723, + "loss": 2.884, + "theoretical_loss": 3.583643571236764, + "tokens_seen": 1213998080 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1944383, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.62280011177063, + "objective/train/theoretical_loss": 3.583639132086127, + "objective/train/tokens_used": 1234474464, + "theoretical_loss": 3.583639132086127, + "tokens_seen": 1214014464 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003192778335005015, + "loss": 2.8795, + "theoretical_loss": 3.583625815094308, + "tokens_seen": 1214063616 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003192678034102307, + "loss": 2.9734, + "theoretical_loss": 3.583608060178676, + "tokens_seen": 1214129152 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031925777331995986, + "loss": 2.9057, + "theoretical_loss": 3.5835903064897163, + "tokens_seen": 1214194688 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003192477432296891, + "loss": 2.9103, + "theoretical_loss": 3.5835725540272785, + "tokens_seen": 1214260224 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003192377131394182, + "loss": 2.9276, + "theoretical_loss": 3.5835548027912107, + "tokens_seen": 1214325760 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031922768304914746, + "loss": 2.8774, + "theoretical_loss": 3.583537052781363, + "tokens_seen": 1214391296 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003192176529588766, + "loss": 2.8349, + "theoretical_loss": 3.5835193039975834, + "tokens_seen": 1214456832 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003192076228686058, + "loss": 2.8068, + "theoretical_loss": 3.583501556439723, + "tokens_seen": 1214522368 + }, + { + "epoch": 3.06, + "learning_rate": 0.000319197592778335, + "loss": 2.8816, + "theoretical_loss": 3.5834838101076287, + "tokens_seen": 1214587904 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003191875626880642, + "loss": 2.8952, + "theoretical_loss": 3.583466065001151, + "tokens_seen": 1214653440 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031917753259779337, + "loss": 2.8588, + "theoretical_loss": 3.583448321120139, + "tokens_seen": 1214718976 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003191675025075226, + "loss": 2.8284, + "theoretical_loss": 3.5834305784644425, + "tokens_seen": 1214784512 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031915747241725173, + "loss": 3.0092, + "theoretical_loss": 3.58341283703391, + "tokens_seen": 1214850048 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031914744232698096, + "loss": 2.8528, + "theoretical_loss": 3.58339509682839, + "tokens_seen": 1214915584 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003191374122367101, + "loss": 2.9045, + "theoretical_loss": 3.5833773578477333, + "tokens_seen": 1214981120 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003191273821464393, + "loss": 2.9087, + "theoretical_loss": 3.583359620091789, + "tokens_seen": 1215046656 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003191173520561685, + "loss": 2.8861, + "theoretical_loss": 3.583341883560406, + "tokens_seen": 1215112192 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003191073219658977, + "loss": 2.8946, + "theoretical_loss": 3.5833241482534346, + "tokens_seen": 1215177728 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031909729187562687, + "loss": 2.8311, + "theoretical_loss": 3.583306414170723, + "tokens_seen": 1215243264 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031908726178535605, + "loss": 2.9139, + "theoretical_loss": 3.583288681312122, + "tokens_seen": 1215308800 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031907723169508523, + "loss": 2.8221, + "theoretical_loss": 3.58327094967748, + "tokens_seen": 1215374336 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031906720160481447, + "loss": 2.8982, + "theoretical_loss": 3.5832532192666466, + "tokens_seen": 1215439872 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031905717151454365, + "loss": 2.977, + "theoretical_loss": 3.5832354900794723, + "tokens_seen": 1215505408 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031904714142427283, + "loss": 2.7867, + "theoretical_loss": 3.583217762115806, + "tokens_seen": 1215570944 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031903711133400206, + "loss": 2.8698, + "theoretical_loss": 3.583200035375497, + "tokens_seen": 1215636480 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1947233, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0140342712402344, + "objective/train/theoretical_loss": 3.583195603881552, + "objective/train/tokens_used": 1236112864, + "theoretical_loss": 3.583195603881552, + "tokens_seen": 1215652864 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003190270812437312, + "loss": 2.8901, + "theoretical_loss": 3.5831823098583957, + "tokens_seen": 1215702016 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003190170511534604, + "loss": 2.897, + "theoretical_loss": 3.583164585564351, + "tokens_seen": 1215767552 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031900702106318955, + "loss": 2.9239, + "theoretical_loss": 3.583146862493213, + "tokens_seen": 1215833088 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003189969909729188, + "loss": 2.7522, + "theoretical_loss": 3.5831291406448322, + "tokens_seen": 1215898624 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031898696088264797, + "loss": 2.7595, + "theoretical_loss": 3.5831114200190566, + "tokens_seen": 1215964160 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031897693079237715, + "loss": 2.8583, + "theoretical_loss": 3.5830937006157373, + "tokens_seen": 1216029696 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031896690070210633, + "loss": 2.838, + "theoretical_loss": 3.583075982434724, + "tokens_seen": 1216095232 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003189568706118355, + "loss": 2.9399, + "theoretical_loss": 3.5830582654758656, + "tokens_seen": 1216160768 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003189468405215647, + "loss": 2.7261, + "theoretical_loss": 3.583040549739013, + "tokens_seen": 1216226304 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031893681043129393, + "loss": 2.7365, + "theoretical_loss": 3.583022835224015, + "tokens_seen": 1216291840 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031892678034102306, + "loss": 2.9833, + "theoretical_loss": 3.583005121930723, + "tokens_seen": 1216357376 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003189167502507523, + "loss": 2.9159, + "theoretical_loss": 3.5829874098589856, + "tokens_seen": 1216422912 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003189067201604814, + "loss": 2.9492, + "theoretical_loss": 3.582969699008653, + "tokens_seen": 1216488448 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031889669007021065, + "loss": 2.7992, + "theoretical_loss": 3.5829519893795756, + "tokens_seen": 1216553984 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031888665997993984, + "loss": 2.8316, + "theoretical_loss": 3.582934280971603, + "tokens_seen": 1216619520 + }, + { + "epoch": 3.06, + "learning_rate": 0.000318876629889669, + "loss": 2.7174, + "theoretical_loss": 3.582916573784586, + "tokens_seen": 1216685056 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003188665997993982, + "loss": 2.7136, + "theoretical_loss": 3.5828988678183737, + "tokens_seen": 1216750592 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031885656970912743, + "loss": 2.656, + "theoretical_loss": 3.5828811630728166, + "tokens_seen": 1216816128 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031884653961885656, + "loss": 2.91, + "theoretical_loss": 3.582863459547765, + "tokens_seen": 1216881664 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003188365095285858, + "loss": 2.7174, + "theoretical_loss": 3.582845757243068, + "tokens_seen": 1216947200 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003188264794383149, + "loss": 2.8887, + "theoretical_loss": 3.5828280561585775, + "tokens_seen": 1217012736 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031881644934804416, + "loss": 2.8639, + "theoretical_loss": 3.582810356294143, + "tokens_seen": 1217078272 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031880641925777334, + "loss": 2.9013, + "theoretical_loss": 3.5827926576496134, + "tokens_seen": 1217143808 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003187963891675025, + "loss": 2.8156, + "theoretical_loss": 3.5827749602248407, + "tokens_seen": 1217209344 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003187863590772317, + "loss": 2.8058, + "theoretical_loss": 3.5827572640196745, + "tokens_seen": 1217274880 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1949834, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8453214168548584, + "objective/train/theoretical_loss": 3.582752840158929, + "objective/train/tokens_used": 1237751264, + "theoretical_loss": 3.582752840158929, + "tokens_seen": 1217291264 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003187763289869609, + "loss": 2.97, + "theoretical_loss": 3.582739569033965, + "tokens_seen": 1217340416 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031876629889669006, + "loss": 2.9888, + "theoretical_loss": 3.5827218752675627, + "tokens_seen": 1217405952 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003187562688064193, + "loss": 2.9664, + "theoretical_loss": 3.5827041827203177, + "tokens_seen": 1217471488 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003187462387161484, + "loss": 2.987, + "theoretical_loss": 3.5826864913920806, + "tokens_seen": 1217537024 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031873620862587766, + "loss": 2.9582, + "theoretical_loss": 3.582668801282702, + "tokens_seen": 1217602560 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003187261785356068, + "loss": 2.9874, + "theoretical_loss": 3.582651112392032, + "tokens_seen": 1217668096 + }, + { + "epoch": 3.06, + "learning_rate": 0.000318716148445336, + "loss": 2.9246, + "theoretical_loss": 3.582633424719921, + "tokens_seen": 1217733632 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003187061183550652, + "loss": 2.8568, + "theoretical_loss": 3.5826157382662194, + "tokens_seen": 1217799168 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003186960882647944, + "loss": 2.7404, + "theoretical_loss": 3.582598053030778, + "tokens_seen": 1217864704 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031868605817452357, + "loss": 2.9422, + "theoretical_loss": 3.582580369013448, + "tokens_seen": 1217930240 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003186760280842528, + "loss": 2.9263, + "theoretical_loss": 3.5825626862140787, + "tokens_seen": 1217995776 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031866599799398193, + "loss": 2.8018, + "theoretical_loss": 3.582545004632521, + "tokens_seen": 1218061312 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031865596790371116, + "loss": 2.7757, + "theoretical_loss": 3.582527324268626, + "tokens_seen": 1218126848 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003186459378134403, + "loss": 2.9202, + "theoretical_loss": 3.582509645122244, + "tokens_seen": 1218192384 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003186359077231695, + "loss": 2.9328, + "theoretical_loss": 3.5824919671932256, + "tokens_seen": 1218257920 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003186258776328987, + "loss": 2.8844, + "theoretical_loss": 3.5824742904814215, + "tokens_seen": 1218323456 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003186158475426279, + "loss": 2.7141, + "theoretical_loss": 3.582456614986683, + "tokens_seen": 1218388992 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031860581745235707, + "loss": 2.7619, + "theoretical_loss": 3.58243894070886, + "tokens_seen": 1218454528 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031859578736208625, + "loss": 2.9458, + "theoretical_loss": 3.582421267647804, + "tokens_seen": 1218520064 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031858575727181543, + "loss": 2.9285, + "theoretical_loss": 3.5824035958033655, + "tokens_seen": 1218585600 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031857572718154467, + "loss": 2.945, + "theoretical_loss": 3.5823859251753953, + "tokens_seen": 1218651136 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003185656970912738, + "loss": 2.9306, + "theoretical_loss": 3.5823682557637437, + "tokens_seen": 1218716672 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031855566700100303, + "loss": 2.8048, + "theoretical_loss": 3.5823505875682624, + "tokens_seen": 1218782208 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003185456369107322, + "loss": 2.9774, + "theoretical_loss": 3.582332920588802, + "tokens_seen": 1218847744 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003185356068204614, + "loss": 2.8843, + "theoretical_loss": 3.5823152548252137, + "tokens_seen": 1218913280 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1952726, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8794891834259033, + "objective/train/theoretical_loss": 3.582310838574279, + "objective/train/tokens_used": 1239389664, + "theoretical_loss": 3.582310838574279, + "tokens_seen": 1218929664 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031852557673019057, + "loss": 2.6881, + "theoretical_loss": 3.582297590277348, + "tokens_seen": 1218978816 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031851554663991975, + "loss": 2.7893, + "theoretical_loss": 3.582279926945056, + "tokens_seen": 1219044352 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031850551654964893, + "loss": 2.777, + "theoretical_loss": 3.5822622648281888, + "tokens_seen": 1219109888 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031849548645937817, + "loss": 2.8048, + "theoretical_loss": 3.582244603926598, + "tokens_seen": 1219175424 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003184854563691073, + "loss": 2.9578, + "theoretical_loss": 3.582226944240133, + "tokens_seen": 1219240960 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031847542627883653, + "loss": 2.8257, + "theoretical_loss": 3.5822092857686467, + "tokens_seen": 1219306496 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031846539618856566, + "loss": 2.8709, + "theoretical_loss": 3.5821916285119895, + "tokens_seen": 1219372032 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003184553660982949, + "loss": 2.8181, + "theoretical_loss": 3.582173972470012, + "tokens_seen": 1219437568 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003184453360080241, + "loss": 2.9225, + "theoretical_loss": 3.582156317642567, + "tokens_seen": 1219503104 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031843530591775326, + "loss": 2.7901, + "theoretical_loss": 3.5821386640295034, + "tokens_seen": 1219568640 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031842527582748244, + "loss": 2.9094, + "theoretical_loss": 3.582121011630674, + "tokens_seen": 1219634176 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003184152457372116, + "loss": 2.8959, + "theoretical_loss": 3.58210336044593, + "tokens_seen": 1219699712 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003184052156469408, + "loss": 2.7541, + "theoretical_loss": 3.5820857104751216, + "tokens_seen": 1219765248 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031839518555667004, + "loss": 2.9652, + "theoretical_loss": 3.5820680617181013, + "tokens_seen": 1219830784 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031838515546639916, + "loss": 2.7219, + "theoretical_loss": 3.58205041417472, + "tokens_seen": 1219896320 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003183751253761284, + "loss": 3.1063, + "theoretical_loss": 3.5820327678448294, + "tokens_seen": 1219961856 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003183650952858576, + "loss": 2.8764, + "theoretical_loss": 3.5820151227282797, + "tokens_seen": 1220027392 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031835506519558676, + "loss": 3.12, + "theoretical_loss": 3.581997478824923, + "tokens_seen": 1220092928 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031834503510531594, + "loss": 2.9604, + "theoretical_loss": 3.5819798361346114, + "tokens_seen": 1220158464 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003183350050150451, + "loss": 2.8304, + "theoretical_loss": 3.5819621946571956, + "tokens_seen": 1220224000 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003183249749247743, + "loss": 2.8918, + "theoretical_loss": 3.581944554392527, + "tokens_seen": 1220289536 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031831494483450354, + "loss": 2.8438, + "theoretical_loss": 3.5819269153404574, + "tokens_seen": 1220355072 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003183049147442327, + "loss": 2.8585, + "theoretical_loss": 3.5819092775008383, + "tokens_seen": 1220420608 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003182948846539619, + "loss": 2.8834, + "theoretical_loss": 3.581891640873521, + "tokens_seen": 1220486144 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003182848545636911, + "loss": 2.9492, + "theoretical_loss": 3.5818740054583578, + "tokens_seen": 1220551680 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1955470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.047192335128784, + "objective/train/theoretical_loss": 3.5818695967939487, + "objective/train/tokens_used": 1241028064, + "theoretical_loss": 3.5818695967939487, + "tokens_seen": 1220568064 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031827482447342026, + "loss": 2.9618, + "theoretical_loss": 3.581856371255199, + "tokens_seen": 1220617216 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003182647943831495, + "loss": 2.869, + "theoretical_loss": 3.5818387382638983, + "tokens_seen": 1220682752 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003182547642928786, + "loss": 2.6695, + "theoretical_loss": 3.5818211064843055, + "tokens_seen": 1220748288 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003182547642928786, + "loss": 2.9663, + "theoretical_loss": 3.5818034759162725, + "tokens_seen": 1220813824 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031824473420260786, + "loss": 2.9588, + "theoretical_loss": 3.581785846559652, + "tokens_seen": 1220879360 + }, + { + "epoch": 3.06, + "learning_rate": 0.000318234704112337, + "loss": 2.7521, + "theoretical_loss": 3.581768218414295, + "tokens_seen": 1220944896 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003182246740220662, + "loss": 2.9038, + "theoretical_loss": 3.5817505914800534, + "tokens_seen": 1221010432 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003182146439317954, + "loss": 2.8649, + "theoretical_loss": 3.581732965756779, + "tokens_seen": 1221075968 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003182046138415246, + "loss": 3.0551, + "theoretical_loss": 3.5817153412443234, + "tokens_seen": 1221141504 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031819458375125377, + "loss": 2.8455, + "theoretical_loss": 3.5816977179425393, + "tokens_seen": 1221207040 + }, + { + "epoch": 3.06, + "learning_rate": 0.000318184553660983, + "loss": 2.9151, + "theoretical_loss": 3.5816800958512776, + "tokens_seen": 1221272576 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031817452357071213, + "loss": 2.7177, + "theoretical_loss": 3.5816624749703907, + "tokens_seen": 1221338112 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031816449348044136, + "loss": 2.8599, + "theoretical_loss": 3.5816448552997304, + "tokens_seen": 1221403648 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003181544633901705, + "loss": 2.8797, + "theoretical_loss": 3.5816272368391484, + "tokens_seen": 1221469184 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003181444332998997, + "loss": 2.7888, + "theoretical_loss": 3.581609619588497, + "tokens_seen": 1221534720 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003181344032096289, + "loss": 2.8197, + "theoretical_loss": 3.5815920035476285, + "tokens_seen": 1221600256 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003181243731193581, + "loss": 2.8798, + "theoretical_loss": 3.581574388716394, + "tokens_seen": 1221665792 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031811434302908727, + "loss": 2.7111, + "theoretical_loss": 3.5815567750946466, + "tokens_seen": 1221731328 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031810431293881645, + "loss": 2.9885, + "theoretical_loss": 3.5815391626822377, + "tokens_seen": 1221796864 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031809428284854563, + "loss": 2.7785, + "theoretical_loss": 3.5815215514790193, + "tokens_seen": 1221862400 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031808425275827487, + "loss": 2.6359, + "theoretical_loss": 3.5815039414848444, + "tokens_seen": 1221927936 + }, + { + "epoch": 3.06, + "learning_rate": 0.000318074222668004, + "loss": 2.7466, + "theoretical_loss": 3.5814863326995643, + "tokens_seen": 1221993472 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031806419257773323, + "loss": 2.7932, + "theoretical_loss": 3.581468725123031, + "tokens_seen": 1222059008 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003180541624874624, + "loss": 2.8098, + "theoretical_loss": 3.5814511187550977, + "tokens_seen": 1222124544 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003180441323971916, + "loss": 2.9916, + "theoretical_loss": 3.5814335135956163, + "tokens_seen": 1222190080 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1956960, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.747211217880249, + "objective/train/theoretical_loss": 3.581429112494549, + "objective/train/tokens_used": 1242666464, + "theoretical_loss": 3.581429112494549, + "tokens_seen": 1222206464 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031803410230692077, + "loss": 2.8216, + "theoretical_loss": 3.5814159096444387, + "tokens_seen": 1222255616 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031802407221664995, + "loss": 2.8056, + "theoretical_loss": 3.5813983069014173, + "tokens_seen": 1222321152 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031801404212637913, + "loss": 2.8325, + "theoretical_loss": 3.5813807053664046, + "tokens_seen": 1222386688 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031800401203610837, + "loss": 2.7302, + "theoretical_loss": 3.5813631050392525, + "tokens_seen": 1222452224 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003179939819458375, + "loss": 3.0304, + "theoretical_loss": 3.581345505919814, + "tokens_seen": 1222517760 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031798395185556673, + "loss": 2.7481, + "theoretical_loss": 3.5813279080079408, + "tokens_seen": 1222583296 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031797392176529586, + "loss": 2.7984, + "theoretical_loss": 3.5813103113034863, + "tokens_seen": 1222648832 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003179638916750251, + "loss": 2.8849, + "theoretical_loss": 3.5812927158063017, + "tokens_seen": 1222714368 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003179538615847543, + "loss": 2.9485, + "theoretical_loss": 3.5812751215162404, + "tokens_seen": 1222779904 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031794383149448346, + "loss": 2.8236, + "theoretical_loss": 3.581257528433155, + "tokens_seen": 1222845440 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031793380140421264, + "loss": 2.7798, + "theoretical_loss": 3.581239936556897, + "tokens_seen": 1222910976 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003179237713139418, + "loss": 2.9024, + "theoretical_loss": 3.58122234588732, + "tokens_seen": 1222976512 + }, + { + "epoch": 3.06, + "learning_rate": 0.000317913741223671, + "loss": 2.9557, + "theoretical_loss": 3.5812047564242757, + "tokens_seen": 1223042048 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031790371113340024, + "loss": 2.7276, + "theoretical_loss": 3.5811871681676175, + "tokens_seen": 1223107584 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031789368104312936, + "loss": 2.8424, + "theoretical_loss": 3.581169581117198, + "tokens_seen": 1223173120 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003178836509528586, + "loss": 2.7058, + "theoretical_loss": 3.5811519952728688, + "tokens_seen": 1223238656 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003178736208625878, + "loss": 2.8093, + "theoretical_loss": 3.581134410634484, + "tokens_seen": 1223304192 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031786359077231696, + "loss": 2.8542, + "theoretical_loss": 3.581116827201895, + "tokens_seen": 1223369728 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031785356068204614, + "loss": 2.8625, + "theoretical_loss": 3.581099244974956, + "tokens_seen": 1223435264 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003178435305917753, + "loss": 2.8815, + "theoretical_loss": 3.581081663953518, + "tokens_seen": 1223500800 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003178335005015045, + "loss": 2.6314, + "theoretical_loss": 3.5810640841374353, + "tokens_seen": 1223566336 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031782347041123374, + "loss": 2.878, + "theoretical_loss": 3.58104650552656, + "tokens_seen": 1223631872 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031781344032096287, + "loss": 2.8321, + "theoretical_loss": 3.5810289281207446, + "tokens_seen": 1223697408 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003178034102306921, + "loss": 2.8908, + "theoretical_loss": 3.5810113519198428, + "tokens_seen": 1223762944 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031779338014042123, + "loss": 2.6989, + "theoretical_loss": 3.5809937769237066, + "tokens_seen": 1223828480 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1959765, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4698104858398438, + "objective/train/theoretical_loss": 3.5809893833629003, + "objective/train/tokens_used": 1244304864, + "theoretical_loss": 3.5809893833629003, + "tokens_seen": 1223844864 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031778335005015046, + "loss": 2.8051, + "theoretical_loss": 3.5809762031321895, + "tokens_seen": 1223894016 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031777331995987964, + "loss": 2.8871, + "theoretical_loss": 3.580958630545145, + "tokens_seen": 1223959552 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003177632898696088, + "loss": 3.005, + "theoretical_loss": 3.5809410591624244, + "tokens_seen": 1224025088 + }, + { + "epoch": 3.06, + "learning_rate": 0.000317753259779338, + "loss": 2.7067, + "theoretical_loss": 3.580923488983882, + "tokens_seen": 1224090624 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003177432296890672, + "loss": 2.9851, + "theoretical_loss": 3.5809059200093705, + "tokens_seen": 1224156160 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031773319959879637, + "loss": 2.6415, + "theoretical_loss": 3.580888352238743, + "tokens_seen": 1224221696 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003177231695085256, + "loss": 2.9476, + "theoretical_loss": 3.5808707856718525, + "tokens_seen": 1224287232 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031771313941825473, + "loss": 2.8698, + "theoretical_loss": 3.580853220308552, + "tokens_seen": 1224352768 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031770310932798397, + "loss": 2.6578, + "theoretical_loss": 3.580835656148695, + "tokens_seen": 1224418304 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031769307923771315, + "loss": 2.9494, + "theoretical_loss": 3.580818093192134, + "tokens_seen": 1224483840 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031768304914744233, + "loss": 2.9479, + "theoretical_loss": 3.5808005314387223, + "tokens_seen": 1224549376 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003176730190571715, + "loss": 2.8503, + "theoretical_loss": 3.5807829708883134, + "tokens_seen": 1224614912 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003176629889669007, + "loss": 2.8152, + "theoretical_loss": 3.5807654115407606, + "tokens_seen": 1224680448 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031765295887662987, + "loss": 2.5837, + "theoretical_loss": 3.580747853395917, + "tokens_seen": 1224745984 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003176429287863591, + "loss": 2.9688, + "theoretical_loss": 3.580730296453636, + "tokens_seen": 1224811520 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031763289869608823, + "loss": 2.8181, + "theoretical_loss": 3.58071274071377, + "tokens_seen": 1224877056 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031762286860581747, + "loss": 2.8385, + "theoretical_loss": 3.5806951861761736, + "tokens_seen": 1224942592 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003176128385155466, + "loss": 2.9367, + "theoretical_loss": 3.5806776328406995, + "tokens_seen": 1225008128 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031760280842527583, + "loss": 2.9546, + "theoretical_loss": 3.580660080707201, + "tokens_seen": 1225073664 + }, + { + "epoch": 3.06, + "learning_rate": 0.000317592778335005, + "loss": 2.9259, + "theoretical_loss": 3.580642529775532, + "tokens_seen": 1225139200 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003175827482447342, + "loss": 2.8634, + "theoretical_loss": 3.5806249800455454, + "tokens_seen": 1225204736 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003175727181544634, + "loss": 2.7735, + "theoretical_loss": 3.5806074315170946, + "tokens_seen": 1225270272 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003175626880641926, + "loss": 2.8655, + "theoretical_loss": 3.580589884190034, + "tokens_seen": 1225335808 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003175526579739218, + "loss": 2.8632, + "theoretical_loss": 3.580572338064216, + "tokens_seen": 1225401344 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031754262788365097, + "loss": 2.9108, + "theoretical_loss": 3.5805547931394948, + "tokens_seen": 1225466880 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1962462, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8817520141601562, + "objective/train/theoretical_loss": 3.580550407095968, + "objective/train/tokens_used": 1245943264, + "theoretical_loss": 3.580550407095968, + "tokens_seen": 1225483264 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031753259779338015, + "loss": 2.9285, + "theoretical_loss": 3.5805372494157233, + "tokens_seen": 1225532416 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031752256770310933, + "loss": 2.8473, + "theoretical_loss": 3.5805197068927557, + "tokens_seen": 1225597952 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031751253761283857, + "loss": 2.6858, + "theoretical_loss": 3.5805021655704454, + "tokens_seen": 1225663488 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003175025075225677, + "loss": 2.99, + "theoretical_loss": 3.580484625448646, + "tokens_seen": 1225729024 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031749247743229693, + "loss": 2.9499, + "theoretical_loss": 3.580467086527211, + "tokens_seen": 1225794560 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031748244734202606, + "loss": 2.8372, + "theoretical_loss": 3.5804495488059946, + "tokens_seen": 1225860096 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003174724172517553, + "loss": 2.94, + "theoretical_loss": 3.58043201228485, + "tokens_seen": 1225925632 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003174623871614845, + "loss": 3.0552, + "theoretical_loss": 3.5804144769636315, + "tokens_seen": 1225991168 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031745235707121366, + "loss": 2.8238, + "theoretical_loss": 3.5803969428421922, + "tokens_seen": 1226056704 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031744232698094284, + "loss": 2.9286, + "theoretical_loss": 3.580379409920386, + "tokens_seen": 1226122240 + }, + { + "epoch": 3.06, + "learning_rate": 0.000317432296890672, + "loss": 2.8074, + "theoretical_loss": 3.580361878198067, + "tokens_seen": 1226187776 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003174222668004012, + "loss": 2.7646, + "theoretical_loss": 3.580344347675089, + "tokens_seen": 1226253312 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031741223671013044, + "loss": 2.9032, + "theoretical_loss": 3.5803268183513053, + "tokens_seen": 1226318848 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031740220661985956, + "loss": 2.9665, + "theoretical_loss": 3.5803092902265705, + "tokens_seen": 1226384384 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003173921765295888, + "loss": 2.9124, + "theoretical_loss": 3.5802917633007385, + "tokens_seen": 1226449920 + }, + { + "epoch": 3.06, + "learning_rate": 0.000317382146439318, + "loss": 2.8506, + "theoretical_loss": 3.580274237573663, + "tokens_seen": 1226515456 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031737211634904716, + "loss": 2.9649, + "theoretical_loss": 3.5802567130451974, + "tokens_seen": 1226580992 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031736208625877634, + "loss": 2.7906, + "theoretical_loss": 3.580239189715197, + "tokens_seen": 1226646528 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003173520561685055, + "loss": 2.9778, + "theoretical_loss": 3.5802216675835146, + "tokens_seen": 1226712064 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003173420260782347, + "loss": 2.9551, + "theoretical_loss": 3.580204146650005, + "tokens_seen": 1226777600 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031733199598796394, + "loss": 2.8248, + "theoretical_loss": 3.580186626914522, + "tokens_seen": 1226843136 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031732196589769307, + "loss": 2.7694, + "theoretical_loss": 3.5801691083769196, + "tokens_seen": 1226908672 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003173119358074223, + "loss": 2.8452, + "theoretical_loss": 3.580151591037052, + "tokens_seen": 1226974208 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031730190571715143, + "loss": 2.7793, + "theoretical_loss": 3.580134074894773, + "tokens_seen": 1227039744 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031729187562688066, + "loss": 2.7159, + "theoretical_loss": 3.580116559949938, + "tokens_seen": 1227105280 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1965393, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9603776931762695, + "objective/train/theoretical_loss": 3.5801121814008123, + "objective/train/tokens_used": 1247581664, + "theoretical_loss": 3.5801121814008123, + "tokens_seen": 1227121664 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031728184553660984, + "loss": 2.826, + "theoretical_loss": 3.5800990462024, + "tokens_seen": 1227170816 + }, + { + "epoch": 3.06, + "learning_rate": 0.000317271815446339, + "loss": 2.9091, + "theoretical_loss": 3.580081533652013, + "tokens_seen": 1227236352 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003172617853560682, + "loss": 2.769, + "theoretical_loss": 3.5800640222986315, + "tokens_seen": 1227301888 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003172517552657974, + "loss": 2.7703, + "theoretical_loss": 3.5800465121421112, + "tokens_seen": 1227367424 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031724172517552657, + "loss": 2.8582, + "theoretical_loss": 3.5800290031823043, + "tokens_seen": 1227432960 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003172316950852558, + "loss": 2.8887, + "theoretical_loss": 3.5800114954190665, + "tokens_seen": 1227498496 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031722166499498493, + "loss": 2.8242, + "theoretical_loss": 3.5799939888522516, + "tokens_seen": 1227564032 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031721163490471417, + "loss": 2.9189, + "theoretical_loss": 3.579976483481714, + "tokens_seen": 1227629568 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031720160481444335, + "loss": 2.907, + "theoretical_loss": 3.5799589793073086, + "tokens_seen": 1227695104 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031719157472417253, + "loss": 2.8867, + "theoretical_loss": 3.579941476328889, + "tokens_seen": 1227760640 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003171815446339017, + "loss": 2.7875, + "theoretical_loss": 3.57992397454631, + "tokens_seen": 1227826176 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003171715145436309, + "loss": 2.8947, + "theoretical_loss": 3.579906473959426, + "tokens_seen": 1227891712 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031716148445336007, + "loss": 2.7755, + "theoretical_loss": 3.5798889745680924, + "tokens_seen": 1227957248 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003171514543630893, + "loss": 2.788, + "theoretical_loss": 3.579871476372162, + "tokens_seen": 1228022784 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031714142427281843, + "loss": 2.8029, + "theoretical_loss": 3.579853979371491, + "tokens_seen": 1228088320 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031713139418254767, + "loss": 2.8372, + "theoretical_loss": 3.579836483565933, + "tokens_seen": 1228153856 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003171213640922768, + "loss": 2.9429, + "theoretical_loss": 3.579818988955343, + "tokens_seen": 1228219392 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031711133400200603, + "loss": 2.7055, + "theoretical_loss": 3.5798014955395754, + "tokens_seen": 1228284928 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003171013039117352, + "loss": 2.8857, + "theoretical_loss": 3.579784003318485, + "tokens_seen": 1228350464 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003170912738214644, + "loss": 2.819, + "theoretical_loss": 3.5797665122919264, + "tokens_seen": 1228416000 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003170812437311936, + "loss": 2.8275, + "theoretical_loss": 3.5797490224597546, + "tokens_seen": 1228481536 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003170712136409228, + "loss": 2.9829, + "theoretical_loss": 3.579731533821824, + "tokens_seen": 1228547072 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031706118355065194, + "loss": 2.6689, + "theoretical_loss": 3.5797140463779895, + "tokens_seen": 1228612608 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003170511534603812, + "loss": 2.8932, + "theoretical_loss": 3.5796965601281054, + "tokens_seen": 1228678144 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003170411233701103, + "loss": 2.8171, + "theoretical_loss": 3.5796790750720273, + "tokens_seen": 1228743680 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1968139, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.982083797454834, + "objective/train/theoretical_loss": 3.579674703994523, + "objective/train/tokens_used": 1249220064, + "theoretical_loss": 3.579674703994523, + "tokens_seen": 1228760064 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031703109327983954, + "loss": 2.9485, + "theoretical_loss": 3.57966159120961, + "tokens_seen": 1228809216 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003170210631895687, + "loss": 2.8156, + "theoretical_loss": 3.579644108540707, + "tokens_seen": 1228874752 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003170110330992979, + "loss": 2.874, + "theoretical_loss": 3.579626627065175, + "tokens_seen": 1228940288 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003170010030090271, + "loss": 2.7435, + "theoretical_loss": 3.579609146782868, + "tokens_seen": 1229005824 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031699097291875626, + "loss": 2.8362, + "theoretical_loss": 3.5795916676936415, + "tokens_seen": 1229071360 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031698094282848544, + "loss": 2.7977, + "theoretical_loss": 3.579574189797349, + "tokens_seen": 1229136896 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003169709127382147, + "loss": 2.4972, + "theoretical_loss": 3.5795567130938473, + "tokens_seen": 1229202432 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003169608826479438, + "loss": 2.8129, + "theoretical_loss": 3.57953923758299, + "tokens_seen": 1229267968 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031695085255767304, + "loss": 2.8001, + "theoretical_loss": 3.579521763264633, + "tokens_seen": 1229333504 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031694082246740217, + "loss": 2.8249, + "theoretical_loss": 3.579504290138631, + "tokens_seen": 1229399040 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003169307923771314, + "loss": 3.0327, + "theoretical_loss": 3.579486818204839, + "tokens_seen": 1229464576 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003169207622868606, + "loss": 2.9562, + "theoretical_loss": 3.579469347463113, + "tokens_seen": 1229530112 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031691073219658976, + "loss": 2.9637, + "theoretical_loss": 3.579451877913307, + "tokens_seen": 1229595648 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031690070210631894, + "loss": 2.7716, + "theoretical_loss": 3.5794344095552764, + "tokens_seen": 1229661184 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003168906720160482, + "loss": 2.9036, + "theoretical_loss": 3.5794169423888764, + "tokens_seen": 1229726720 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003168806419257773, + "loss": 2.8682, + "theoretical_loss": 3.579399476413963, + "tokens_seen": 1229792256 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031687061183550654, + "loss": 2.9491, + "theoretical_loss": 3.5793820116303903, + "tokens_seen": 1229857792 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031686058174523567, + "loss": 2.91, + "theoretical_loss": 3.579364548038014, + "tokens_seen": 1229923328 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003168505516549649, + "loss": 2.7805, + "theoretical_loss": 3.57934708563669, + "tokens_seen": 1229988864 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003168405215646941, + "loss": 2.8699, + "theoretical_loss": 3.579329624426273, + "tokens_seen": 1230054400 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031683049147442327, + "loss": 3.0138, + "theoretical_loss": 3.579312164406618, + "tokens_seen": 1230119936 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031682046138415245, + "loss": 2.8519, + "theoretical_loss": 3.5792947055775812, + "tokens_seen": 1230185472 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031681043129388163, + "loss": 2.8415, + "theoretical_loss": 3.579277247939017, + "tokens_seen": 1230251008 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031680040120361086, + "loss": 2.8186, + "theoretical_loss": 3.5792597914907818, + "tokens_seen": 1230316544 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031679037111334004, + "loss": 2.816, + "theoretical_loss": 3.5792423362327304, + "tokens_seen": 1230382080 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1971045, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.722249984741211, + "objective/train/theoretical_loss": 3.579237972604167, + "objective/train/tokens_used": 1250858464, + "theoretical_loss": 3.579237972604167, + "tokens_seen": 1230398464 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003167803410230692, + "loss": 2.7384, + "theoretical_loss": 3.579224882164719, + "tokens_seen": 1230447616 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003167703109327984, + "loss": 2.7255, + "theoretical_loss": 3.579207429286602, + "tokens_seen": 1230513152 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003167602808425276, + "loss": 3.071, + "theoretical_loss": 3.5791899775982357, + "tokens_seen": 1230578688 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031675025075225677, + "loss": 2.7769, + "theoretical_loss": 3.579172527099476, + "tokens_seen": 1230644224 + }, + { + "epoch": 3.06, + "learning_rate": 0.000316740220661986, + "loss": 2.8084, + "theoretical_loss": 3.579155077790177, + "tokens_seen": 1230709760 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031673019057171513, + "loss": 3.0117, + "theoretical_loss": 3.5791376296701953, + "tokens_seen": 1230775296 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031672016048144437, + "loss": 2.9168, + "theoretical_loss": 3.579120182739387, + "tokens_seen": 1230840832 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031671013039117355, + "loss": 2.8262, + "theoretical_loss": 3.5791027369976067, + "tokens_seen": 1230906368 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031670010030090273, + "loss": 2.8165, + "theoretical_loss": 3.5790852924447103, + "tokens_seen": 1230971904 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003166900702106319, + "loss": 2.9136, + "theoretical_loss": 3.5790678490805545, + "tokens_seen": 1231037440 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003166800401203611, + "loss": 2.8556, + "theoretical_loss": 3.5790504069049938, + "tokens_seen": 1231102976 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031667001003009027, + "loss": 2.8894, + "theoretical_loss": 3.579032965917884, + "tokens_seen": 1231168512 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003166599799398195, + "loss": 2.8279, + "theoretical_loss": 3.5790155261190817, + "tokens_seen": 1231234048 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031664994984954863, + "loss": 2.8734, + "theoretical_loss": 3.5789980875084417, + "tokens_seen": 1231299584 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031663991975927787, + "loss": 3.0064, + "theoretical_loss": 3.5789806500858212, + "tokens_seen": 1231365120 + }, + { + "epoch": 3.06, + "learning_rate": 0.000316629889669007, + "loss": 2.8492, + "theoretical_loss": 3.5789632138510745, + "tokens_seen": 1231430656 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031661985957873623, + "loss": 2.8362, + "theoretical_loss": 3.578945778804058, + "tokens_seen": 1231496192 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003166098294884654, + "loss": 2.7538, + "theoretical_loss": 3.5789283449446283, + "tokens_seen": 1231561728 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003165997993981946, + "loss": 2.6415, + "theoretical_loss": 3.5789109122726406, + "tokens_seen": 1231627264 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003165897693079238, + "loss": 2.655, + "theoretical_loss": 3.578893480787951, + "tokens_seen": 1231692800 + }, + { + "epoch": 3.06, + "learning_rate": 0.000316579739217653, + "loss": 2.8066, + "theoretical_loss": 3.5788760504904156, + "tokens_seen": 1231758336 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031656970912738214, + "loss": 3.0002, + "theoretical_loss": 3.57885862137989, + "tokens_seen": 1231823872 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003165596790371114, + "loss": 2.8541, + "theoretical_loss": 3.5788411934562303, + "tokens_seen": 1231889408 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003165496489468405, + "loss": 2.8773, + "theoretical_loss": 3.5788237667192933, + "tokens_seen": 1231954944 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031653961885656974, + "loss": 2.8965, + "theoretical_loss": 3.578806341168934, + "tokens_seen": 1232020480 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1973510, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.458749771118164, + "objective/train/theoretical_loss": 3.5788019849667303, + "objective/train/tokens_used": 1252496864, + "theoretical_loss": 3.5788019849667303, + "tokens_seen": 1232036864 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003165295887662989, + "loss": 2.7106, + "theoretical_loss": 3.578788916805009, + "tokens_seen": 1232086016 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003165195586760281, + "loss": 2.837, + "theoretical_loss": 3.578771493627375, + "tokens_seen": 1232151552 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003165095285857573, + "loss": 2.9936, + "theoretical_loss": 3.5787540716358874, + "tokens_seen": 1232217088 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031649949849548646, + "loss": 2.6864, + "theoretical_loss": 3.578736650830402, + "tokens_seen": 1232282624 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031648946840521564, + "loss": 2.8892, + "theoretical_loss": 3.5787192312107763, + "tokens_seen": 1232348160 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003164794383149449, + "loss": 3.0269, + "theoretical_loss": 3.578701812776865, + "tokens_seen": 1232413696 + }, + { + "epoch": 3.06, + "learning_rate": 0.000316469408224674, + "loss": 2.8158, + "theoretical_loss": 3.5786843955285255, + "tokens_seen": 1232479232 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031645937813440324, + "loss": 3.0304, + "theoretical_loss": 3.5786669794656136, + "tokens_seen": 1232544768 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031644934804413237, + "loss": 2.9431, + "theoretical_loss": 3.578649564587985, + "tokens_seen": 1232610304 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003164393179538616, + "loss": 2.951, + "theoretical_loss": 3.5786321508954977, + "tokens_seen": 1232675840 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003164292878635908, + "loss": 2.9653, + "theoretical_loss": 3.5786147383880067, + "tokens_seen": 1232741376 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031641925777331996, + "loss": 2.9272, + "theoretical_loss": 3.578597327065368, + "tokens_seen": 1232806912 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031640922768304914, + "loss": 2.9352, + "theoretical_loss": 3.5785799169274393, + "tokens_seen": 1232872448 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003163991975927784, + "loss": 2.7863, + "theoretical_loss": 3.578562507974077, + "tokens_seen": 1232937984 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003163891675025075, + "loss": 2.668, + "theoretical_loss": 3.578545100205136, + "tokens_seen": 1233003520 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031637913741223674, + "loss": 2.9347, + "theoretical_loss": 3.5785276936204737, + "tokens_seen": 1233069056 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031636910732196587, + "loss": 2.8254, + "theoretical_loss": 3.578510288219947, + "tokens_seen": 1233134592 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003163590772316951, + "loss": 2.974, + "theoretical_loss": 3.5784928840034116, + "tokens_seen": 1233200128 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003163490471414243, + "loss": 3.0265, + "theoretical_loss": 3.578475480970725, + "tokens_seen": 1233265664 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031633901705115347, + "loss": 2.6945, + "theoretical_loss": 3.5784580791217433, + "tokens_seen": 1233331200 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031632898696088265, + "loss": 2.9354, + "theoretical_loss": 3.5784406784563227, + "tokens_seen": 1233396736 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031631895687061183, + "loss": 2.8915, + "theoretical_loss": 3.57842327897432, + "tokens_seen": 1233462272 + }, + { + "epoch": 3.06, + "learning_rate": 0.000316308926780341, + "loss": 2.8581, + "theoretical_loss": 3.5784058806755925, + "tokens_seen": 1233527808 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031629889669007024, + "loss": 2.9117, + "theoretical_loss": 3.578388483559996, + "tokens_seen": 1233593344 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031628886659979937, + "loss": 2.849, + "theoretical_loss": 3.5783710876273878, + "tokens_seen": 1233658880 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1974912, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8697381019592285, + "objective/train/theoretical_loss": 3.578366738829061, + "objective/train/tokens_used": 1254135264, + "theoretical_loss": 3.578366738829061, + "tokens_seen": 1233675264 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003162788365095286, + "loss": 2.8436, + "theoretical_loss": 3.5783536928776245, + "tokens_seen": 1233724416 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031626880641925773, + "loss": 2.8794, + "theoretical_loss": 3.5783362993105623, + "tokens_seen": 1233789952 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031625877632898697, + "loss": 2.8952, + "theoretical_loss": 3.578318906926059, + "tokens_seen": 1233855488 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031624874623871615, + "loss": 2.9367, + "theoretical_loss": 3.57830151572397, + "tokens_seen": 1233921024 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031623871614844533, + "loss": 2.7671, + "theoretical_loss": 3.5782841257041538, + "tokens_seen": 1233986560 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003162286860581745, + "loss": 2.8865, + "theoretical_loss": 3.578266736866466, + "tokens_seen": 1234052096 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031621865596790375, + "loss": 2.8363, + "theoretical_loss": 3.578249349210764, + "tokens_seen": 1234117632 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003162086258776329, + "loss": 2.9601, + "theoretical_loss": 3.578231962736904, + "tokens_seen": 1234183168 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003161985957873621, + "loss": 2.7644, + "theoretical_loss": 3.5782145774447436, + "tokens_seen": 1234248704 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031618856569709124, + "loss": 2.7842, + "theoretical_loss": 3.57819719333414, + "tokens_seen": 1234314240 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031617853560682047, + "loss": 2.9103, + "theoretical_loss": 3.5781798104049494, + "tokens_seen": 1234379776 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031616850551654965, + "loss": 2.6467, + "theoretical_loss": 3.5781624286570297, + "tokens_seen": 1234445312 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031615847542627883, + "loss": 2.8557, + "theoretical_loss": 3.578145048090237, + "tokens_seen": 1234510848 + }, + { + "epoch": 3.06, + "learning_rate": 0.000316148445336008, + "loss": 2.7838, + "theoretical_loss": 3.578127668704428, + "tokens_seen": 1234576384 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003161384152457372, + "loss": 2.7562, + "theoretical_loss": 3.5781102904994615, + "tokens_seen": 1234641920 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003161283851554664, + "loss": 2.582, + "theoretical_loss": 3.5780929134751935, + "tokens_seen": 1234707456 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003161183550651956, + "loss": 2.9829, + "theoretical_loss": 3.578075537631481, + "tokens_seen": 1234772992 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031610832497492474, + "loss": 2.8844, + "theoretical_loss": 3.578058162968181, + "tokens_seen": 1234838528 + }, + { + "epoch": 3.06, + "learning_rate": 0.000316098294884654, + "loss": 2.9364, + "theoretical_loss": 3.5780407894851516, + "tokens_seen": 1234904064 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003160882647943831, + "loss": 2.6904, + "theoretical_loss": 3.578023417182249, + "tokens_seen": 1234969600 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031607823470411234, + "loss": 2.7689, + "theoretical_loss": 3.578006046059331, + "tokens_seen": 1235035136 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003160682046138415, + "loss": 2.8666, + "theoretical_loss": 3.577988676116255, + "tokens_seen": 1235100672 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003160581745235707, + "loss": 2.8151, + "theoretical_loss": 3.5779713073528776, + "tokens_seen": 1235166208 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031604814443329994, + "loss": 2.9316, + "theoretical_loss": 3.5779539397690563, + "tokens_seen": 1235231744 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003160381143430291, + "loss": 2.8096, + "theoretical_loss": 3.577936573364649, + "tokens_seen": 1235297280 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1977647, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8676016330718994, + "objective/train/theoretical_loss": 3.5779322319478135, + "objective/train/tokens_used": 1255773664, + "theoretical_loss": 3.5779322319478135, + "tokens_seen": 1235313664 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003160280842527583, + "loss": 2.9182, + "theoretical_loss": 3.577919208139512, + "tokens_seen": 1235362816 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003160180541624875, + "loss": 2.9083, + "theoretical_loss": 3.5779018440935033, + "tokens_seen": 1235428352 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031600802407221666, + "loss": 2.8151, + "theoretical_loss": 3.577884481226481, + "tokens_seen": 1235493888 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031599799398194584, + "loss": 2.9025, + "theoretical_loss": 3.5778671195383014, + "tokens_seen": 1235559424 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003159879638916751, + "loss": 2.6576, + "theoretical_loss": 3.5778497590288216, + "tokens_seen": 1235624960 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003159779338014042, + "loss": 2.8698, + "theoretical_loss": 3.5778323996979005, + "tokens_seen": 1235690496 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031596790371113344, + "loss": 2.9358, + "theoretical_loss": 3.577815041545395, + "tokens_seen": 1235756032 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031595787362086257, + "loss": 2.8503, + "theoretical_loss": 3.577797684571162, + "tokens_seen": 1235821568 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003159478435305918, + "loss": 3.0676, + "theoretical_loss": 3.57778032877506, + "tokens_seen": 1235887104 + }, + { + "epoch": 3.06, + "learning_rate": 0.000315937813440321, + "loss": 2.9366, + "theoretical_loss": 3.5777629741569457, + "tokens_seen": 1235952640 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031592778335005016, + "loss": 3.0118, + "theoretical_loss": 3.5777456207166773, + "tokens_seen": 1236018176 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031591775325977934, + "loss": 2.9422, + "theoretical_loss": 3.577728268454112, + "tokens_seen": 1236083712 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003159077231695086, + "loss": 2.8191, + "theoretical_loss": 3.577710917369108, + "tokens_seen": 1236149248 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003158976930792377, + "loss": 2.9456, + "theoretical_loss": 3.5776935674615222, + "tokens_seen": 1236214784 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031588766298896694, + "loss": 2.7967, + "theoretical_loss": 3.577676218731213, + "tokens_seen": 1236280320 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031587763289869607, + "loss": 2.796, + "theoretical_loss": 3.5776588711780377, + "tokens_seen": 1236345856 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003158676028084253, + "loss": 2.7628, + "theoretical_loss": 3.5776415248018543, + "tokens_seen": 1236411392 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003158575727181545, + "loss": 2.8967, + "theoretical_loss": 3.57762417960252, + "tokens_seen": 1236476928 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031584754262788367, + "loss": 2.8481, + "theoretical_loss": 3.5776068355798936, + "tokens_seen": 1236542464 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031583751253761285, + "loss": 2.9777, + "theoretical_loss": 3.5775894927338316, + "tokens_seen": 1236608000 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031582748244734203, + "loss": 2.9075, + "theoretical_loss": 3.577572151064193, + "tokens_seen": 1236673536 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003158174523570712, + "loss": 2.9001, + "theoretical_loss": 3.577554810570835, + "tokens_seen": 1236739072 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031580742226680044, + "loss": 2.959, + "theoretical_loss": 3.577537471253616, + "tokens_seen": 1236804608 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031579739217652957, + "loss": 2.7772, + "theoretical_loss": 3.577520133112393, + "tokens_seen": 1236870144 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003157873620862588, + "loss": 2.896, + "theoretical_loss": 3.577502796147025, + "tokens_seen": 1236935680 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1980617, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.026431083679199, + "objective/train/theoretical_loss": 3.5774984620893937, + "objective/train/tokens_used": 1257412064, + "theoretical_loss": 3.5774984620893937, + "tokens_seen": 1236952064 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031577733199598793, + "loss": 2.8043, + "theoretical_loss": 3.5774854603573694, + "tokens_seen": 1237001216 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031576730190571717, + "loss": 2.9857, + "theoretical_loss": 3.5774681257432848, + "tokens_seen": 1237066752 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031575727181544635, + "loss": 2.8609, + "theoretical_loss": 3.577450792304628, + "tokens_seen": 1237132288 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031574724172517553, + "loss": 2.8214, + "theoretical_loss": 3.577433460041258, + "tokens_seen": 1237197824 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003157372116349047, + "loss": 2.7327, + "theoretical_loss": 3.5774161289530326, + "tokens_seen": 1237263360 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031572718154463395, + "loss": 2.7549, + "theoretical_loss": 3.57739879903981, + "tokens_seen": 1237328896 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003157171514543631, + "loss": 2.7509, + "theoretical_loss": 3.577381470301448, + "tokens_seen": 1237394432 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003157071213640923, + "loss": 2.9685, + "theoretical_loss": 3.577364142737805, + "tokens_seen": 1237459968 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031569709127382144, + "loss": 2.5962, + "theoretical_loss": 3.577346816348739, + "tokens_seen": 1237525504 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031568706118355067, + "loss": 2.7441, + "theoretical_loss": 3.5773294911341083, + "tokens_seen": 1237591040 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031567703109327985, + "loss": 2.8846, + "theoretical_loss": 3.5773121670937713, + "tokens_seen": 1237656576 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031566700100300903, + "loss": 2.8224, + "theoretical_loss": 3.577294844227586, + "tokens_seen": 1237722112 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003156569709127382, + "loss": 2.7563, + "theoretical_loss": 3.57727752253541, + "tokens_seen": 1237787648 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003156469408224674, + "loss": 2.9653, + "theoretical_loss": 3.5772602020171034, + "tokens_seen": 1237853184 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003156369107321966, + "loss": 2.8028, + "theoretical_loss": 3.5772428826725227, + "tokens_seen": 1237918720 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003156268806419258, + "loss": 2.7824, + "theoretical_loss": 3.5772255645015267, + "tokens_seen": 1237984256 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031561685055165494, + "loss": 2.8788, + "theoretical_loss": 3.5772082475039744, + "tokens_seen": 1238049792 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003156068204613842, + "loss": 2.8251, + "theoretical_loss": 3.5771909316797235, + "tokens_seen": 1238115328 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003155967903711133, + "loss": 2.7453, + "theoretical_loss": 3.5771736170286323, + "tokens_seen": 1238180864 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031558676028084254, + "loss": 2.9039, + "theoretical_loss": 3.5771563035505602, + "tokens_seen": 1238246400 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003155767301905717, + "loss": 2.8068, + "theoretical_loss": 3.5771389912453646, + "tokens_seen": 1238311936 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003155667001003009, + "loss": 2.7271, + "theoretical_loss": 3.5771216801129047, + "tokens_seen": 1238377472 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003155566700100301, + "loss": 2.7637, + "theoretical_loss": 3.5771043701530383, + "tokens_seen": 1238443008 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003155466399197593, + "loss": 2.8267, + "theoretical_loss": 3.5770870613656243, + "tokens_seen": 1238508544 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031553660982948844, + "loss": 2.7975, + "theoretical_loss": 3.5770697537505214, + "tokens_seen": 1238574080 + }, + { + "epoch": 3.06, + "objective/train/docs_used": 1983470, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.801051139831543, + "objective/train/theoretical_loss": 3.577065427029903, + "objective/train/tokens_used": 1259050464, + "theoretical_loss": 3.577065427029903, + "tokens_seen": 1238590464 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003155265797392177, + "loss": 2.691, + "theoretical_loss": 3.5770524473075884, + "tokens_seen": 1238639616 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003155165496489468, + "loss": 2.8506, + "theoretical_loss": 3.5770351420366833, + "tokens_seen": 1238705152 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031550651955867604, + "loss": 2.7827, + "theoretical_loss": 3.5770178379376647, + "tokens_seen": 1238770688 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003154964894684052, + "loss": 2.7648, + "theoretical_loss": 3.5770005350103915, + "tokens_seen": 1238836224 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003154864593781344, + "loss": 2.8228, + "theoretical_loss": 3.576983233254723, + "tokens_seen": 1238901760 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003154764292878636, + "loss": 2.9299, + "theoretical_loss": 3.576965932670517, + "tokens_seen": 1238967296 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031546639919759277, + "loss": 2.7275, + "theoretical_loss": 3.576948633257633, + "tokens_seen": 1239032832 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031545636910732195, + "loss": 2.9066, + "theoretical_loss": 3.576931335015928, + "tokens_seen": 1239098368 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003154463390170512, + "loss": 2.8696, + "theoretical_loss": 3.5769140379452633, + "tokens_seen": 1239163904 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003154363089267803, + "loss": 2.893, + "theoretical_loss": 3.576896742045496, + "tokens_seen": 1239229440 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031542627883650954, + "loss": 2.8819, + "theoretical_loss": 3.576879447316485, + "tokens_seen": 1239294976 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003154162487462387, + "loss": 2.7905, + "theoretical_loss": 3.57686215375809, + "tokens_seen": 1239360512 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003154062186559679, + "loss": 3.0042, + "theoretical_loss": 3.5768448613701693, + "tokens_seen": 1239426048 + }, + { + "epoch": 3.06, + "learning_rate": 0.0003153961885656971, + "loss": 2.7219, + "theoretical_loss": 3.576827570152582, + "tokens_seen": 1239491584 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031538615847542627, + "loss": 2.865, + "theoretical_loss": 3.576810280105186, + "tokens_seen": 1239557120 + }, + { + "epoch": 3.06, + "learning_rate": 0.00031537612838515545, + "loss": 2.6953, + "theoretical_loss": 3.5767929912278422, + "tokens_seen": 1239622656 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003153660982948847, + "loss": 2.9252, + "theoretical_loss": 3.5767757035204077, + "tokens_seen": 1239688192 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003153560682046138, + "loss": 2.9714, + "theoretical_loss": 3.5767584169827433, + "tokens_seen": 1239753728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031534603811434305, + "loss": 2.8959, + "theoretical_loss": 3.5767411316147064, + "tokens_seen": 1239819264 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003153360080240722, + "loss": 2.8595, + "theoretical_loss": 3.5767238474161567, + "tokens_seen": 1239884800 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003153259779338014, + "loss": 2.7701, + "theoretical_loss": 3.576706564386953, + "tokens_seen": 1239950336 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003153159478435306, + "loss": 2.8438, + "theoretical_loss": 3.576689282526955, + "tokens_seen": 1240015872 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031530591775325977, + "loss": 2.9045, + "theoretical_loss": 3.5766720018360214, + "tokens_seen": 1240081408 + }, + { + "epoch": 3.07, + "learning_rate": 0.000315295887662989, + "loss": 2.8309, + "theoretical_loss": 3.5766547223140113, + "tokens_seen": 1240146944 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031528585757271813, + "loss": 3.0029, + "theoretical_loss": 3.576637443960784, + "tokens_seen": 1240212480 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1985980, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.867603302001953, + "objective/train/theoretical_loss": 3.576633124555083, + "objective/train/tokens_used": 1260688864, + "theoretical_loss": 3.576633124555083, + "tokens_seen": 1240228864 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031527582748244737, + "loss": 2.8379, + "theoretical_loss": 3.5766201667761983, + "tokens_seen": 1240278016 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031526579739217655, + "loss": 3.0649, + "theoretical_loss": 3.5766028907601144, + "tokens_seen": 1240343552 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031525576730190573, + "loss": 2.8798, + "theoretical_loss": 3.57658561591239, + "tokens_seen": 1240409088 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003152457372116349, + "loss": 2.8195, + "theoretical_loss": 3.576568342232886, + "tokens_seen": 1240474624 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031523570712136415, + "loss": 2.8313, + "theoretical_loss": 3.576551069721461, + "tokens_seen": 1240540160 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003152256770310933, + "loss": 2.879, + "theoretical_loss": 3.576533798377974, + "tokens_seen": 1240605696 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003152156469408225, + "loss": 3.0363, + "theoretical_loss": 3.5765165282022844, + "tokens_seen": 1240671232 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031520561685055164, + "loss": 2.9176, + "theoretical_loss": 3.5764992591942515, + "tokens_seen": 1240736768 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031519558676028087, + "loss": 2.8887, + "theoretical_loss": 3.5764819913537353, + "tokens_seen": 1240802304 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031518555667001005, + "loss": 2.8947, + "theoretical_loss": 3.5764647246805947, + "tokens_seen": 1240867840 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031517552657973924, + "loss": 2.8408, + "theoretical_loss": 3.5764474591746893, + "tokens_seen": 1240933376 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003151654964894684, + "loss": 2.9006, + "theoretical_loss": 3.5764301948358783, + "tokens_seen": 1240998912 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003151554663991976, + "loss": 2.995, + "theoretical_loss": 3.576412931664022, + "tokens_seen": 1241064448 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003151454363089268, + "loss": 2.6922, + "theoretical_loss": 3.5763956696589787, + "tokens_seen": 1241129984 + }, + { + "epoch": 3.07, + "learning_rate": 0.000315135406218656, + "loss": 2.8119, + "theoretical_loss": 3.5763784088206085, + "tokens_seen": 1241195520 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031512537612838514, + "loss": 2.8923, + "theoretical_loss": 3.5763611491487715, + "tokens_seen": 1241261056 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003151153460381144, + "loss": 2.8699, + "theoretical_loss": 3.5763438906433267, + "tokens_seen": 1241326592 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003151053159478435, + "loss": 2.8017, + "theoretical_loss": 3.5763266333041335, + "tokens_seen": 1241392128 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031509528585757274, + "loss": 2.8231, + "theoretical_loss": 3.576309377131052, + "tokens_seen": 1241457664 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003150852557673019, + "loss": 2.8966, + "theoretical_loss": 3.576292122123941, + "tokens_seen": 1241523200 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003150752256770311, + "loss": 2.992, + "theoretical_loss": 3.5762748682826615, + "tokens_seen": 1241588736 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003150651955867603, + "loss": 2.7627, + "theoretical_loss": 3.5762576156070724, + "tokens_seen": 1241654272 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003150551654964895, + "loss": 2.8549, + "theoretical_loss": 3.5762403640970333, + "tokens_seen": 1241719808 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031504513540621864, + "loss": 2.8256, + "theoretical_loss": 3.5762231137524045, + "tokens_seen": 1241785344 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003150351053159479, + "loss": 2.9178, + "theoretical_loss": 3.5762058645730455, + "tokens_seen": 1241850880 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1988886, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.133693218231201, + "objective/train/theoretical_loss": 3.5762015524602626, + "objective/train/tokens_used": 1262327264, + "theoretical_loss": 3.5762015524602626, + "tokens_seen": 1241867264 + }, + { + "epoch": 3.07, + "learning_rate": 0.000315025075225677, + "loss": 3.0453, + "theoretical_loss": 3.5761886165588157, + "tokens_seen": 1241916416 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031501504513540624, + "loss": 2.9311, + "theoretical_loss": 3.5761713697095754, + "tokens_seen": 1241981952 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003150050150451354, + "loss": 2.9895, + "theoretical_loss": 3.5761541240251846, + "tokens_seen": 1242047488 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003149949849548646, + "loss": 2.9516, + "theoretical_loss": 3.5761368795055026, + "tokens_seen": 1242113024 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003149849548645938, + "loss": 2.8228, + "theoretical_loss": 3.5761196361503895, + "tokens_seen": 1242178560 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031497492477432297, + "loss": 2.8385, + "theoretical_loss": 3.576102393959706, + "tokens_seen": 1242244096 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031496489468405215, + "loss": 2.882, + "theoretical_loss": 3.5760851529333104, + "tokens_seen": 1242309632 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003149548645937814, + "loss": 2.7991, + "theoretical_loss": 3.576067913071064, + "tokens_seen": 1242375168 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003149448345035105, + "loss": 2.9638, + "theoretical_loss": 3.5760506743728264, + "tokens_seen": 1242440704 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031493480441323974, + "loss": 2.8184, + "theoretical_loss": 3.576033436838458, + "tokens_seen": 1242506240 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003149247743229689, + "loss": 2.9385, + "theoretical_loss": 3.5760162004678184, + "tokens_seen": 1242571776 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003149147442326981, + "loss": 2.8232, + "theoretical_loss": 3.5759989652607675, + "tokens_seen": 1242637312 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003149047141424273, + "loss": 2.738, + "theoretical_loss": 3.5759817312171656, + "tokens_seen": 1242702848 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031489468405215647, + "loss": 2.991, + "theoretical_loss": 3.5759644983368726, + "tokens_seen": 1242768384 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031488465396188565, + "loss": 2.821, + "theoretical_loss": 3.575947266619749, + "tokens_seen": 1242833920 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003148746238716149, + "loss": 2.7903, + "theoretical_loss": 3.575930036065655, + "tokens_seen": 1242899456 + }, + { + "epoch": 3.07, + "learning_rate": 0.000314864593781344, + "loss": 2.9733, + "theoretical_loss": 3.5759128066744506, + "tokens_seen": 1242964992 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031485456369107325, + "loss": 2.9114, + "theoretical_loss": 3.575895578445996, + "tokens_seen": 1243030528 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003148445336008024, + "loss": 2.9238, + "theoretical_loss": 3.5758783513801515, + "tokens_seen": 1243096064 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003148345035105316, + "loss": 2.7555, + "theoretical_loss": 3.575861125476777, + "tokens_seen": 1243161600 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003148244734202608, + "loss": 2.803, + "theoretical_loss": 3.5758439007357334, + "tokens_seen": 1243227136 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031481444332998997, + "loss": 2.8771, + "theoretical_loss": 3.575826677156881, + "tokens_seen": 1243292672 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031480441323971915, + "loss": 2.7688, + "theoretical_loss": 3.575809454740079, + "tokens_seen": 1243358208 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031479438314944833, + "loss": 2.8163, + "theoretical_loss": 3.5757922334851893, + "tokens_seen": 1243423744 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003147843530591775, + "loss": 2.7554, + "theoretical_loss": 3.5757750133920716, + "tokens_seen": 1243489280 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1991679, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.93513822555542, + "objective/train/theoretical_loss": 3.5757707085503023, + "objective/train/tokens_used": 1263965664, + "theoretical_loss": 3.5757707085503023, + "tokens_seen": 1243505664 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031477432296890675, + "loss": 2.8625, + "theoretical_loss": 3.5757577944605856, + "tokens_seen": 1243554816 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003147642928786359, + "loss": 2.9341, + "theoretical_loss": 3.5757405766905928, + "tokens_seen": 1243620352 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003147542627883651, + "loss": 2.8636, + "theoretical_loss": 3.5757233600819527, + "tokens_seen": 1243685888 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003147442326980943, + "loss": 2.7848, + "theoretical_loss": 3.575706144634527, + "tokens_seen": 1243751424 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003147342026078235, + "loss": 2.9333, + "theoretical_loss": 3.575688930348176, + "tokens_seen": 1243816960 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031472417251755266, + "loss": 3.0119, + "theoretical_loss": 3.5756717172227583, + "tokens_seen": 1243882496 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031471414242728184, + "loss": 2.8887, + "theoretical_loss": 3.5756545052581368, + "tokens_seen": 1243948032 + }, + { + "epoch": 3.07, + "learning_rate": 0.000314704112337011, + "loss": 2.9229, + "theoretical_loss": 3.575637294454171, + "tokens_seen": 1244013568 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031469408224674025, + "loss": 2.8804, + "theoretical_loss": 3.575620084810722, + "tokens_seen": 1244079104 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003146840521564694, + "loss": 2.844, + "theoretical_loss": 3.575602876327649, + "tokens_seen": 1244144640 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003146740220661986, + "loss": 2.8803, + "theoretical_loss": 3.5755856690048144, + "tokens_seen": 1244210176 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031466399197592774, + "loss": 2.733, + "theoretical_loss": 3.575568462842078, + "tokens_seen": 1244275712 + }, + { + "epoch": 3.07, + "learning_rate": 0.000314653961885657, + "loss": 2.9059, + "theoretical_loss": 3.575551257839301, + "tokens_seen": 1244341248 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031464393179538616, + "loss": 2.7508, + "theoretical_loss": 3.5755340539963436, + "tokens_seen": 1244406784 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031463390170511534, + "loss": 2.9021, + "theoretical_loss": 3.5755168513130666, + "tokens_seen": 1244472320 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003146238716148445, + "loss": 2.6836, + "theoretical_loss": 3.5754996497893314, + "tokens_seen": 1244537856 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003146138415245737, + "loss": 2.9191, + "theoretical_loss": 3.5754824494249977, + "tokens_seen": 1244603392 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003146038114343029, + "loss": 2.8079, + "theoretical_loss": 3.575465250219927, + "tokens_seen": 1244668928 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003145937813440321, + "loss": 2.7936, + "theoretical_loss": 3.57544805217398, + "tokens_seen": 1244734464 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031458375125376125, + "loss": 2.8584, + "theoretical_loss": 3.575430855287018, + "tokens_seen": 1244800000 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003145737211634905, + "loss": 2.866, + "theoretical_loss": 3.575413659558901, + "tokens_seen": 1244865536 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003145636910732197, + "loss": 2.9396, + "theoretical_loss": 3.57539646498949, + "tokens_seen": 1244931072 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031455366098294884, + "loss": 2.8146, + "theoretical_loss": 3.575379271578647, + "tokens_seen": 1244996608 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003145436308926781, + "loss": 2.868, + "theoretical_loss": 3.575362079326232, + "tokens_seen": 1245062144 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003145336008024072, + "loss": 2.763, + "theoretical_loss": 3.5753448882321064, + "tokens_seen": 1245127680 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1994510, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0713753700256348, + "objective/train/theoretical_loss": 3.5753405906395415, + "objective/train/tokens_used": 1265604064, + "theoretical_loss": 3.5753405906395415, + "tokens_seen": 1245144064 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031452357071213644, + "loss": 2.856, + "theoretical_loss": 3.575327698296131, + "tokens_seen": 1245193216 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003145135406218656, + "loss": 2.9407, + "theoretical_loss": 3.5753105095181668, + "tokens_seen": 1245258752 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003145035105315948, + "loss": 2.8378, + "theoretical_loss": 3.5752933218980747, + "tokens_seen": 1245324288 + }, + { + "epoch": 3.07, + "learning_rate": 0.000314493480441324, + "loss": 2.8774, + "theoretical_loss": 3.5752761354357165, + "tokens_seen": 1245389824 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031448345035105317, + "loss": 2.922, + "theoretical_loss": 3.5752589501309524, + "tokens_seen": 1245455360 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031447342026078235, + "loss": 3.0659, + "theoretical_loss": 3.5752417659836437, + "tokens_seen": 1245520896 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003144633901705116, + "loss": 2.9355, + "theoretical_loss": 3.575224582993652, + "tokens_seen": 1245586432 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003144533600802407, + "loss": 2.9101, + "theoretical_loss": 3.5752074011608386, + "tokens_seen": 1245651968 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031444332998996994, + "loss": 2.7711, + "theoretical_loss": 3.575190220485064, + "tokens_seen": 1245717504 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003144332998996991, + "loss": 2.8254, + "theoretical_loss": 3.57517304096619, + "tokens_seen": 1245783040 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003144232698094283, + "loss": 2.9801, + "theoretical_loss": 3.575155862604077, + "tokens_seen": 1245848576 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003144132397191575, + "loss": 2.8482, + "theoretical_loss": 3.5751386853985876, + "tokens_seen": 1245914112 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031440320962888667, + "loss": 2.8376, + "theoretical_loss": 3.575121509349582, + "tokens_seen": 1245979648 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031439317953861585, + "loss": 2.8303, + "theoretical_loss": 3.5751043344569218, + "tokens_seen": 1246045184 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003143831494483451, + "loss": 2.8619, + "theoretical_loss": 3.5750871607204684, + "tokens_seen": 1246110720 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003143731193580742, + "loss": 2.8591, + "theoretical_loss": 3.575069988140083, + "tokens_seen": 1246176256 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031436308926780345, + "loss": 2.9409, + "theoretical_loss": 3.5750528167156275, + "tokens_seen": 1246241792 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003143530591775326, + "loss": 2.7582, + "theoretical_loss": 3.5750356464469633, + "tokens_seen": 1246307328 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003143430290872618, + "loss": 2.9266, + "theoretical_loss": 3.575018477333951, + "tokens_seen": 1246372864 + }, + { + "epoch": 3.07, + "learning_rate": 0.000314332998996991, + "loss": 2.8115, + "theoretical_loss": 3.5750013093764523, + "tokens_seen": 1246438400 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031432296890672017, + "loss": 2.9633, + "theoretical_loss": 3.5749841425743294, + "tokens_seen": 1246503936 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031431293881644935, + "loss": 2.9314, + "theoretical_loss": 3.5749669769274433, + "tokens_seen": 1246569472 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031430290872617853, + "loss": 2.891, + "theoretical_loss": 3.5749498124356553, + "tokens_seen": 1246635008 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003142928786359077, + "loss": 2.8867, + "theoretical_loss": 3.5749326490988276, + "tokens_seen": 1246700544 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031428284854563695, + "loss": 2.7969, + "theoretical_loss": 3.5749154869168214, + "tokens_seen": 1246766080 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1995821, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.834346294403076, + "objective/train/theoretical_loss": 3.5749111965517444, + "objective/train/tokens_used": 1267242464, + "theoretical_loss": 3.5749111965517444, + "tokens_seen": 1246782464 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003142728184553661, + "loss": 2.8968, + "theoretical_loss": 3.574898325889498, + "tokens_seen": 1246831616 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003142627883650953, + "loss": 2.721, + "theoretical_loss": 3.57488116601672, + "tokens_seen": 1246897152 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003142527582748245, + "loss": 2.9885, + "theoretical_loss": 3.5748640072983475, + "tokens_seen": 1246962688 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003142427281845537, + "loss": 3.0194, + "theoretical_loss": 3.5748468497342434, + "tokens_seen": 1247028224 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031423269809428286, + "loss": 2.8961, + "theoretical_loss": 3.5748296933242694, + "tokens_seen": 1247093760 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031422266800401204, + "loss": 3.0235, + "theoretical_loss": 3.5748125380682865, + "tokens_seen": 1247159296 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003142126379137412, + "loss": 2.9833, + "theoretical_loss": 3.5747953839661566, + "tokens_seen": 1247224832 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031420260782347045, + "loss": 2.7254, + "theoretical_loss": 3.5747782310177425, + "tokens_seen": 1247290368 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003141925777331996, + "loss": 2.8447, + "theoretical_loss": 3.5747610792229048, + "tokens_seen": 1247355904 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003141825476429288, + "loss": 2.9133, + "theoretical_loss": 3.5747439285815057, + "tokens_seen": 1247421440 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031417251755265794, + "loss": 2.7325, + "theoretical_loss": 3.5747267790934067, + "tokens_seen": 1247486976 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003141624874623872, + "loss": 2.8761, + "theoretical_loss": 3.5747096307584707, + "tokens_seen": 1247552512 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031415245737211636, + "loss": 2.9412, + "theoretical_loss": 3.574692483576558, + "tokens_seen": 1247618048 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031414242728184554, + "loss": 2.9094, + "theoretical_loss": 3.574675337547532, + "tokens_seen": 1247683584 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003141323971915747, + "loss": 2.9271, + "theoretical_loss": 3.5746581926712544, + "tokens_seen": 1247749120 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003141223671013039, + "loss": 2.9645, + "theoretical_loss": 3.574641048947586, + "tokens_seen": 1247814656 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003141123370110331, + "loss": 2.8502, + "theoretical_loss": 3.57462390637639, + "tokens_seen": 1247880192 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003141023069207623, + "loss": 2.7432, + "theoretical_loss": 3.574606764957528, + "tokens_seen": 1247945728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031409227683049145, + "loss": 2.7402, + "theoretical_loss": 3.574589624690862, + "tokens_seen": 1248011264 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003140822467402207, + "loss": 2.8975, + "theoretical_loss": 3.574572485576254, + "tokens_seen": 1248076800 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031407221664994986, + "loss": 2.9662, + "theoretical_loss": 3.5745553476135665, + "tokens_seen": 1248142336 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031406218655967904, + "loss": 2.8707, + "theoretical_loss": 3.574538210802661, + "tokens_seen": 1248207872 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003140521564694082, + "loss": 2.7453, + "theoretical_loss": 3.5745210751434, + "tokens_seen": 1248273408 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003140421263791374, + "loss": 2.9043, + "theoretical_loss": 3.574503940635645, + "tokens_seen": 1248338944 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003140320962888666, + "loss": 2.6643, + "theoretical_loss": 3.5744868072792593, + "tokens_seen": 1248404480 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 1998704, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.842489004135132, + "objective/train/theoretical_loss": 3.574482524120048, + "objective/train/tokens_used": 1268880864, + "theoretical_loss": 3.574482524120048, + "tokens_seen": 1248420864 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003140220661985958, + "loss": 2.8119, + "theoretical_loss": 3.5744696750741043, + "tokens_seen": 1248470016 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031401203610832495, + "loss": 2.7764, + "theoretical_loss": 3.5744525440200423, + "tokens_seen": 1248535552 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003140020060180542, + "loss": 2.8473, + "theoretical_loss": 3.5744354141169357, + "tokens_seen": 1248601088 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003139919759277833, + "loss": 2.86, + "theoretical_loss": 3.574418285364647, + "tokens_seen": 1248666624 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031398194583751255, + "loss": 2.7093, + "theoretical_loss": 3.574401157763038, + "tokens_seen": 1248732160 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031397191574724173, + "loss": 2.7943, + "theoretical_loss": 3.5743840313119715, + "tokens_seen": 1248797696 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003139618856569709, + "loss": 2.9006, + "theoretical_loss": 3.5743669060113095, + "tokens_seen": 1248863232 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003139518555667001, + "loss": 2.7037, + "theoretical_loss": 3.5743497818609145, + "tokens_seen": 1248928768 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003139418254764293, + "loss": 2.7043, + "theoretical_loss": 3.5743326588606483, + "tokens_seen": 1248994304 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031393179538615845, + "loss": 2.7921, + "theoretical_loss": 3.5743155370103743, + "tokens_seen": 1249059840 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003139217652958877, + "loss": 2.7724, + "theoretical_loss": 3.5742984163099543, + "tokens_seen": 1249125376 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003139117352056168, + "loss": 2.8861, + "theoretical_loss": 3.574281296759251, + "tokens_seen": 1249190912 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031390170511534605, + "loss": 2.787, + "theoretical_loss": 3.574264178358127, + "tokens_seen": 1249256448 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031389167502507523, + "loss": 2.6743, + "theoretical_loss": 3.5742470611064445, + "tokens_seen": 1249321984 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003138816449348044, + "loss": 2.9047, + "theoretical_loss": 3.574229945004066, + "tokens_seen": 1249387520 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003138716148445336, + "loss": 2.7353, + "theoretical_loss": 3.5742128300508544, + "tokens_seen": 1249453056 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003138615847542628, + "loss": 2.8911, + "theoretical_loss": 3.574195716246672, + "tokens_seen": 1249518592 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031385155466399196, + "loss": 2.7603, + "theoretical_loss": 3.574178603591381, + "tokens_seen": 1249584128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003138415245737212, + "loss": 2.7568, + "theoretical_loss": 3.5741614920848455, + "tokens_seen": 1249649664 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003138314944834503, + "loss": 2.7515, + "theoretical_loss": 3.5741443817269265, + "tokens_seen": 1249715200 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031382146439317955, + "loss": 2.852, + "theoretical_loss": 3.5741272725174875, + "tokens_seen": 1249780736 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031381143430290873, + "loss": 2.8421, + "theoretical_loss": 3.574110164456391, + "tokens_seen": 1249846272 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003138014042126379, + "loss": 2.8732, + "theoretical_loss": 3.5740930575434997, + "tokens_seen": 1249911808 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031379137412236715, + "loss": 2.7747, + "theoretical_loss": 3.574075951778676, + "tokens_seen": 1249977344 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003137813440320963, + "loss": 2.8446, + "theoretical_loss": 3.5740588471617833, + "tokens_seen": 1250042880 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2001293, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.828719139099121, + "objective/train/theoretical_loss": 3.5740545711869087, + "objective/train/tokens_used": 1270519264, + "theoretical_loss": 3.5740545711869087, + "tokens_seen": 1250059264 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003137713139418255, + "loss": 2.8842, + "theoretical_loss": 3.5740417436926846, + "tokens_seen": 1250108416 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003137612838515547, + "loss": 2.6829, + "theoretical_loss": 3.574024641371242, + "tokens_seen": 1250173952 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003137512537612839, + "loss": 2.9561, + "theoretical_loss": 3.574007540197318, + "tokens_seen": 1250239488 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031374122367101306, + "loss": 2.9691, + "theoretical_loss": 3.5739904401707765, + "tokens_seen": 1250305024 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031373119358074224, + "loss": 2.8185, + "theoretical_loss": 3.57397334129148, + "tokens_seen": 1250370560 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003137211634904714, + "loss": 2.9707, + "theoretical_loss": 3.573956243559291, + "tokens_seen": 1250436096 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031371113340020065, + "loss": 2.7486, + "theoretical_loss": 3.573939146974073, + "tokens_seen": 1250501632 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003137011033099298, + "loss": 2.795, + "theoretical_loss": 3.5739220515356886, + "tokens_seen": 1250567168 + }, + { + "epoch": 3.07, + "learning_rate": 0.000313691073219659, + "loss": 2.9643, + "theoretical_loss": 3.573904957244001, + "tokens_seen": 1250632704 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031368104312938814, + "loss": 2.9429, + "theoretical_loss": 3.573887864098873, + "tokens_seen": 1250698240 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003136710130391174, + "loss": 2.8627, + "theoretical_loss": 3.5738707721001672, + "tokens_seen": 1250763776 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031366098294884656, + "loss": 2.7699, + "theoretical_loss": 3.5738536812477477, + "tokens_seen": 1250829312 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031365095285857574, + "loss": 2.784, + "theoretical_loss": 3.573836591541477, + "tokens_seen": 1250894848 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003136409227683049, + "loss": 2.7728, + "theoretical_loss": 3.5738195029812183, + "tokens_seen": 1250960384 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003136308926780341, + "loss": 2.8701, + "theoretical_loss": 3.5738024155668344, + "tokens_seen": 1251025920 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003136208625877633, + "loss": 2.7795, + "theoretical_loss": 3.5737853292981887, + "tokens_seen": 1251091456 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003136108324974925, + "loss": 2.8851, + "theoretical_loss": 3.5737682441751444, + "tokens_seen": 1251156992 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031360080240722165, + "loss": 2.9906, + "theoretical_loss": 3.573751160197565, + "tokens_seen": 1251222528 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003135907723169509, + "loss": 2.6918, + "theoretical_loss": 3.573734077365313, + "tokens_seen": 1251288064 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031358074222668006, + "loss": 2.9765, + "theoretical_loss": 3.573716995678252, + "tokens_seen": 1251353600 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031357071213640924, + "loss": 2.7654, + "theoretical_loss": 3.573699915136245, + "tokens_seen": 1251419136 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003135606820461384, + "loss": 3.0233, + "theoretical_loss": 3.573682835739156, + "tokens_seen": 1251484672 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003135506519558676, + "loss": 2.8926, + "theoretical_loss": 3.573665757486847, + "tokens_seen": 1251550208 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003135406218655968, + "loss": 2.6722, + "theoretical_loss": 3.5736486803791827, + "tokens_seen": 1251615744 + }, + { + "epoch": 3.07, + "learning_rate": 0.000313530591775326, + "loss": 2.8182, + "theoretical_loss": 3.573631604416026, + "tokens_seen": 1251681280 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2004074, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0263476371765137, + "objective/train/theoretical_loss": 3.5736273356040504, + "objective/train/tokens_used": 1272157664, + "theoretical_loss": 3.5736273356040504, + "tokens_seen": 1251697664 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031352056168505515, + "loss": 2.8549, + "theoretical_loss": 3.57361452959724, + "tokens_seen": 1251746816 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003135105315947844, + "loss": 3.0458, + "theoretical_loss": 3.5735974559226884, + "tokens_seen": 1251812352 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003135005015045135, + "loss": 2.8601, + "theoretical_loss": 3.5735803833922346, + "tokens_seen": 1251877888 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031349047141424275, + "loss": 2.8907, + "theoretical_loss": 3.5735633120057417, + "tokens_seen": 1251943424 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031348044132397193, + "loss": 2.848, + "theoretical_loss": 3.5735462417630734, + "tokens_seen": 1252008960 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003134704112337011, + "loss": 2.7693, + "theoretical_loss": 3.5735291726640934, + "tokens_seen": 1252074496 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003134603811434303, + "loss": 2.9335, + "theoretical_loss": 3.5735121047086653, + "tokens_seen": 1252140032 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003134503510531595, + "loss": 2.7026, + "theoretical_loss": 3.573495037896652, + "tokens_seen": 1252205568 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031344032096288865, + "loss": 2.8494, + "theoretical_loss": 3.5734779722279173, + "tokens_seen": 1252271104 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003134302908726179, + "loss": 2.8547, + "theoretical_loss": 3.5734609077023256, + "tokens_seen": 1252336640 + }, + { + "epoch": 3.07, + "learning_rate": 0.000313420260782347, + "loss": 2.8037, + "theoretical_loss": 3.573443844319739, + "tokens_seen": 1252402176 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031341023069207625, + "loss": 2.91, + "theoretical_loss": 3.5734267820800225, + "tokens_seen": 1252467712 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031340020060180543, + "loss": 2.8359, + "theoretical_loss": 3.5734097209830393, + "tokens_seen": 1252533248 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003133901705115346, + "loss": 2.7803, + "theoretical_loss": 3.5733926610286524, + "tokens_seen": 1252598784 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003133801404212638, + "loss": 2.8241, + "theoretical_loss": 3.5733756022167267, + "tokens_seen": 1252664320 + }, + { + "epoch": 3.07, + "learning_rate": 0.000313370110330993, + "loss": 2.8144, + "theoretical_loss": 3.5733585445471254, + "tokens_seen": 1252729856 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031336008024072216, + "loss": 2.7808, + "theoretical_loss": 3.573341488019712, + "tokens_seen": 1252795392 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003133500501504514, + "loss": 2.8771, + "theoretical_loss": 3.57332443263435, + "tokens_seen": 1252860928 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003133400200601805, + "loss": 2.8253, + "theoretical_loss": 3.573307378390904, + "tokens_seen": 1252926464 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031332998996990975, + "loss": 2.6521, + "theoretical_loss": 3.573290325289238, + "tokens_seen": 1252992000 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003133199598796389, + "loss": 2.8383, + "theoretical_loss": 3.5732732733292147, + "tokens_seen": 1253057536 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003133099297893681, + "loss": 2.8622, + "theoretical_loss": 3.573256222510699, + "tokens_seen": 1253123072 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003132998996990973, + "loss": 2.8938, + "theoretical_loss": 3.573239172833554, + "tokens_seen": 1253188608 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003132898696088265, + "loss": 2.7369, + "theoretical_loss": 3.5732221242976445, + "tokens_seen": 1253254144 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031327983951855566, + "loss": 2.759, + "theoretical_loss": 3.5732050769028336, + "tokens_seen": 1253319680 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2006919, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.950580596923828, + "objective/train/theoretical_loss": 3.5732008152324117, + "objective/train/tokens_used": 1273796064, + "theoretical_loss": 3.5732008152324117, + "tokens_seen": 1253336064 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003132698094282849, + "loss": 2.9417, + "theoretical_loss": 3.5731880306489856, + "tokens_seen": 1253385216 + }, + { + "epoch": 3.07, + "learning_rate": 0.000313259779338014, + "loss": 3.0198, + "theoretical_loss": 3.573170985535965, + "tokens_seen": 1253450752 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031324974924774326, + "loss": 2.8686, + "theoretical_loss": 3.5731539415636346, + "tokens_seen": 1253516288 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003132397191574724, + "loss": 2.8915, + "theoretical_loss": 3.5731368987318595, + "tokens_seen": 1253581824 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003132296890672016, + "loss": 2.7949, + "theoretical_loss": 3.5731198570405036, + "tokens_seen": 1253647360 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003132196589769308, + "loss": 2.716, + "theoretical_loss": 3.5731028164894303, + "tokens_seen": 1253712896 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031320962888666, + "loss": 2.8964, + "theoretical_loss": 3.5730857770785045, + "tokens_seen": 1253778432 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031319959879638916, + "loss": 2.8987, + "theoretical_loss": 3.5730687388075895, + "tokens_seen": 1253843968 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031318956870611834, + "loss": 2.9752, + "theoretical_loss": 3.5730517016765506, + "tokens_seen": 1253909504 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003131795386158475, + "loss": 2.962, + "theoretical_loss": 3.5730346656852507, + "tokens_seen": 1253975040 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031316950852557676, + "loss": 2.9657, + "theoretical_loss": 3.5730176308335553, + "tokens_seen": 1254040576 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003131594784353059, + "loss": 2.7956, + "theoretical_loss": 3.5730005971213274, + "tokens_seen": 1254106112 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003131494483450351, + "loss": 2.8948, + "theoretical_loss": 3.572983564548432, + "tokens_seen": 1254171648 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031313941825476425, + "loss": 2.9357, + "theoretical_loss": 3.572966533114733, + "tokens_seen": 1254237184 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003131293881644935, + "loss": 2.9649, + "theoretical_loss": 3.572949502820095, + "tokens_seen": 1254302720 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031311935807422267, + "loss": 2.907, + "theoretical_loss": 3.572932473664382, + "tokens_seen": 1254368256 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031310932798395185, + "loss": 2.9548, + "theoretical_loss": 3.5729154456474586, + "tokens_seen": 1254433792 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031309929789368103, + "loss": 2.9386, + "theoretical_loss": 3.5728984187691886, + "tokens_seen": 1254499328 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031308926780341026, + "loss": 2.7953, + "theoretical_loss": 3.572881393029437, + "tokens_seen": 1254564864 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003130792377131394, + "loss": 2.8838, + "theoretical_loss": 3.5728643684280685, + "tokens_seen": 1254630400 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003130692076228686, + "loss": 2.8154, + "theoretical_loss": 3.5728473449649467, + "tokens_seen": 1254695936 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003130591775325978, + "loss": 2.8589, + "theoretical_loss": 3.572830322639936, + "tokens_seen": 1254761472 + }, + { + "epoch": 3.07, + "learning_rate": 0.000313049147442327, + "loss": 2.8834, + "theoretical_loss": 3.5728133014529018, + "tokens_seen": 1254827008 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003130391173520562, + "loss": 2.8263, + "theoretical_loss": 3.5727962814037078, + "tokens_seen": 1254892544 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031302908726178535, + "loss": 3.0888, + "theoretical_loss": 3.5727792624922188, + "tokens_seen": 1254958080 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2009646, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9422459602355957, + "objective/train/theoretical_loss": 3.572775007942097, + "objective/train/tokens_used": 1275434464, + "theoretical_loss": 3.572775007942097, + "tokens_seen": 1254974464 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003130190571715146, + "loss": 2.9068, + "theoretical_loss": 3.572762244718299, + "tokens_seen": 1255023616 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003130090270812437, + "loss": 2.843, + "theoretical_loss": 3.572745228081814, + "tokens_seen": 1255089152 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031299899699097295, + "loss": 2.8873, + "theoretical_loss": 3.572728212582627, + "tokens_seen": 1255154688 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031298896690070213, + "loss": 2.8622, + "theoretical_loss": 3.5727111982206035, + "tokens_seen": 1255220224 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003129789368104313, + "loss": 2.846, + "theoretical_loss": 3.5726941849956084, + "tokens_seen": 1255285760 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003129689067201605, + "loss": 2.8629, + "theoretical_loss": 3.5726771729075053, + "tokens_seen": 1255351296 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003129588766298897, + "loss": 2.759, + "theoretical_loss": 3.5726601619561595, + "tokens_seen": 1255416832 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031294884653961885, + "loss": 2.7818, + "theoretical_loss": 3.5726431521414357, + "tokens_seen": 1255482368 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003129388164493481, + "loss": 2.9275, + "theoretical_loss": 3.572626143463199, + "tokens_seen": 1255547904 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003129287863590772, + "loss": 2.8326, + "theoretical_loss": 3.5726091359213132, + "tokens_seen": 1255613440 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031291875626880645, + "loss": 2.9994, + "theoretical_loss": 3.572592129515644, + "tokens_seen": 1255678976 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031290872617853563, + "loss": 2.888, + "theoretical_loss": 3.5725751242460557, + "tokens_seen": 1255744512 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003128986960882648, + "loss": 2.8197, + "theoretical_loss": 3.5725581201124132, + "tokens_seen": 1255810048 + }, + { + "epoch": 3.07, + "learning_rate": 0.000312888665997994, + "loss": 2.9003, + "theoretical_loss": 3.5725411171145813, + "tokens_seen": 1255875584 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003128786359077232, + "loss": 2.9083, + "theoretical_loss": 3.5725241152524254, + "tokens_seen": 1255941120 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031286860581745236, + "loss": 2.7384, + "theoretical_loss": 3.5725071145258096, + "tokens_seen": 1256006656 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003128585757271816, + "loss": 2.8066, + "theoretical_loss": 3.572490114934599, + "tokens_seen": 1256072192 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003128485456369107, + "loss": 2.7543, + "theoretical_loss": 3.572473116478659, + "tokens_seen": 1256137728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031283851554663995, + "loss": 2.5997, + "theoretical_loss": 3.572456119157854, + "tokens_seen": 1256203264 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003128284854563691, + "loss": 2.9942, + "theoretical_loss": 3.5724391229720496, + "tokens_seen": 1256268800 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003128184553660983, + "loss": 2.8819, + "theoretical_loss": 3.5724221279211106, + "tokens_seen": 1256334336 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003128084252758275, + "loss": 2.9361, + "theoretical_loss": 3.5724051340049012, + "tokens_seen": 1256399872 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003127983951855567, + "loss": 2.9755, + "theoretical_loss": 3.5723881412232874, + "tokens_seen": 1256465408 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031278836509528586, + "loss": 2.8902, + "theoretical_loss": 3.572371149576134, + "tokens_seen": 1256530944 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003127783350050151, + "loss": 2.8176, + "theoretical_loss": 3.5723541590633063, + "tokens_seen": 1256596480 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2011066, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.79646372795105, + "objective/train/theoretical_loss": 3.5723499116123216, + "objective/train/tokens_used": 1277072864, + "theoretical_loss": 3.5723499116123216, + "tokens_seen": 1256612864 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003127683049147442, + "loss": 2.9352, + "theoretical_loss": 3.572337169684669, + "tokens_seen": 1256662016 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031275827482447346, + "loss": 2.8744, + "theoretical_loss": 3.572320181440088, + "tokens_seen": 1256727552 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003127482447342026, + "loss": 2.7914, + "theoretical_loss": 3.572303194329427, + "tokens_seen": 1256793088 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003127382146439318, + "loss": 2.775, + "theoretical_loss": 3.572286208352553, + "tokens_seen": 1256858624 + }, + { + "epoch": 3.07, + "learning_rate": 0.000312728184553661, + "loss": 2.7584, + "theoretical_loss": 3.5722692235093296, + "tokens_seen": 1256924160 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003127181544633902, + "loss": 2.8768, + "theoretical_loss": 3.5722522397996235, + "tokens_seen": 1256989696 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031270812437311936, + "loss": 2.7886, + "theoretical_loss": 3.5722352572232987, + "tokens_seen": 1257055232 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031269809428284854, + "loss": 2.8053, + "theoretical_loss": 3.572218275780221, + "tokens_seen": 1257120768 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003126880641925777, + "loss": 2.7695, + "theoretical_loss": 3.572201295470256, + "tokens_seen": 1257186304 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031267803410230696, + "loss": 2.7162, + "theoretical_loss": 3.572184316293269, + "tokens_seen": 1257251840 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003126680040120361, + "loss": 2.916, + "theoretical_loss": 3.572167338249125, + "tokens_seen": 1257317376 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003126579739217653, + "loss": 2.8695, + "theoretical_loss": 3.5721503613376893, + "tokens_seen": 1257382912 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031264794383149445, + "loss": 2.737, + "theoretical_loss": 3.5721333855588275, + "tokens_seen": 1257448448 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003126379137412237, + "loss": 2.9598, + "theoretical_loss": 3.572116410912405, + "tokens_seen": 1257513984 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031262788365095287, + "loss": 2.7592, + "theoretical_loss": 3.572099437398287, + "tokens_seen": 1257579520 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031261785356068205, + "loss": 2.8924, + "theoretical_loss": 3.572082465016339, + "tokens_seen": 1257645056 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031260782347041123, + "loss": 2.8757, + "theoretical_loss": 3.5720654937664276, + "tokens_seen": 1257710592 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031259779338014046, + "loss": 2.781, + "theoretical_loss": 3.572048523648417, + "tokens_seen": 1257776128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003125877632898696, + "loss": 2.5344, + "theoretical_loss": 3.572031554662173, + "tokens_seen": 1257841664 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003125777331995988, + "loss": 2.8578, + "theoretical_loss": 3.572014586807562, + "tokens_seen": 1257907200 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031256770310932795, + "loss": 2.8797, + "theoretical_loss": 3.5719976200844483, + "tokens_seen": 1257972736 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003125576730190572, + "loss": 3.0118, + "theoretical_loss": 3.5719806544926977, + "tokens_seen": 1258038272 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031254764292878637, + "loss": 2.7535, + "theoretical_loss": 3.5719636900321765, + "tokens_seen": 1258103808 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031253761283851555, + "loss": 2.7873, + "theoretical_loss": 3.571946726702751, + "tokens_seen": 1258169344 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031252758274824473, + "loss": 2.8568, + "theoretical_loss": 3.5719297645042847, + "tokens_seen": 1258234880 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2014802, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.884423017501831, + "objective/train/theoretical_loss": 3.571925524131365, + "objective/train/tokens_used": 1278711264, + "theoretical_loss": 3.571925524131365, + "tokens_seen": 1258251264 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003125175526579739, + "loss": 2.8544, + "theoretical_loss": 3.5719128034366454, + "tokens_seen": 1258300416 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003125075225677031, + "loss": 2.8604, + "theoretical_loss": 3.5718958434996972, + "tokens_seen": 1258365952 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031249749247743233, + "loss": 2.7181, + "theoretical_loss": 3.571878884693307, + "tokens_seen": 1258431488 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031248746238716146, + "loss": 2.9391, + "theoretical_loss": 3.5718619270173404, + "tokens_seen": 1258497024 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003124774322968907, + "loss": 2.7915, + "theoretical_loss": 3.5718449704716626, + "tokens_seen": 1258562560 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003124674022066198, + "loss": 2.9672, + "theoretical_loss": 3.5718280150561403, + "tokens_seen": 1258628096 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031245737211634905, + "loss": 2.7716, + "theoretical_loss": 3.5718110607706386, + "tokens_seen": 1258693632 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031244734202607823, + "loss": 2.8121, + "theoretical_loss": 3.5717941076150233, + "tokens_seen": 1258759168 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003124373119358074, + "loss": 2.9259, + "theoretical_loss": 3.571777155589161, + "tokens_seen": 1258824704 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003124272818455366, + "loss": 2.7256, + "theoretical_loss": 3.5717602046929167, + "tokens_seen": 1258890240 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031241725175526583, + "loss": 2.7422, + "theoretical_loss": 3.571743254926157, + "tokens_seen": 1258955776 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031240722166499496, + "loss": 2.8353, + "theoretical_loss": 3.5717263062887477, + "tokens_seen": 1259021312 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003123971915747242, + "loss": 2.8685, + "theoretical_loss": 3.5717093587805544, + "tokens_seen": 1259086848 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003123871614844533, + "loss": 2.8213, + "theoretical_loss": 3.5716924124014433, + "tokens_seen": 1259152384 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031237713139418256, + "loss": 2.7264, + "theoretical_loss": 3.571675467151281, + "tokens_seen": 1259217920 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031236710130391174, + "loss": 2.7526, + "theoretical_loss": 3.571658523029933, + "tokens_seen": 1259283456 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003123570712136409, + "loss": 2.8609, + "theoretical_loss": 3.5716415800372654, + "tokens_seen": 1259348992 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003123470411233701, + "loss": 2.7471, + "theoretical_loss": 3.571624638173144, + "tokens_seen": 1259414528 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003123370110330993, + "loss": 2.7918, + "theoretical_loss": 3.5716076974374356, + "tokens_seen": 1259480064 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031232698094282846, + "loss": 2.9651, + "theoretical_loss": 3.5715907578300055, + "tokens_seen": 1259545600 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003123169508525577, + "loss": 2.9718, + "theoretical_loss": 3.5715738193507205, + "tokens_seen": 1259611136 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003123069207622869, + "loss": 2.8773, + "theoretical_loss": 3.571556881999447, + "tokens_seen": 1259676672 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031229689067201606, + "loss": 2.9746, + "theoretical_loss": 3.57153994577605, + "tokens_seen": 1259742208 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003122868605817453, + "loss": 2.9049, + "theoretical_loss": 3.571523010680397, + "tokens_seen": 1259807744 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003122768304914744, + "loss": 2.8866, + "theoretical_loss": 3.571506076712354, + "tokens_seen": 1259873280 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2016318, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9251017570495605, + "objective/train/theoretical_loss": 3.571501843396516, + "objective/train/tokens_used": 1280349664, + "theoretical_loss": 3.571501843396516, + "tokens_seen": 1259889664 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031226680040120366, + "loss": 2.8447, + "theoretical_loss": 3.571489143871786, + "tokens_seen": 1259938816 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003122567703109328, + "loss": 2.669, + "theoretical_loss": 3.571472212158562, + "tokens_seen": 1260004352 + }, + { + "epoch": 3.07, + "learning_rate": 0.000312246740220662, + "loss": 2.8645, + "theoretical_loss": 3.571455281572545, + "tokens_seen": 1260069888 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003122367101303912, + "loss": 2.7414, + "theoretical_loss": 3.5714383521136037, + "tokens_seen": 1260135424 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003122266800401204, + "loss": 2.9223, + "theoretical_loss": 3.5714214237816035, + "tokens_seen": 1260200960 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031221664994984956, + "loss": 2.9917, + "theoretical_loss": 3.5714044965764113, + "tokens_seen": 1260266496 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031220661985957874, + "loss": 2.8098, + "theoretical_loss": 3.571387570497893, + "tokens_seen": 1260332032 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003121965897693079, + "loss": 2.833, + "theoretical_loss": 3.5713706455459153, + "tokens_seen": 1260397568 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031218655967903716, + "loss": 2.7996, + "theoretical_loss": 3.571353721720344, + "tokens_seen": 1260463104 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003121765295887663, + "loss": 2.8027, + "theoretical_loss": 3.571336799021047, + "tokens_seen": 1260528640 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003121664994984955, + "loss": 2.8141, + "theoretical_loss": 3.5713198774478894, + "tokens_seen": 1260594176 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031215646940822465, + "loss": 2.8351, + "theoretical_loss": 3.571302957000739, + "tokens_seen": 1260659712 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003121464393179539, + "loss": 2.9491, + "theoretical_loss": 3.571286037679461, + "tokens_seen": 1260725248 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031213640922768307, + "loss": 2.8494, + "theoretical_loss": 3.5712691194839232, + "tokens_seen": 1260790784 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031212637913741225, + "loss": 2.9185, + "theoretical_loss": 3.5712522024139908, + "tokens_seen": 1260856320 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031211634904714143, + "loss": 2.6853, + "theoretical_loss": 3.5712352864695314, + "tokens_seen": 1260921856 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031210631895687066, + "loss": 2.6759, + "theoretical_loss": 3.571218371650412, + "tokens_seen": 1260987392 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003120962888665998, + "loss": 2.8353, + "theoretical_loss": 3.571201457956498, + "tokens_seen": 1261052928 + }, + { + "epoch": 3.07, + "learning_rate": 0.000312086258776329, + "loss": 2.9097, + "theoretical_loss": 3.5711845453876565, + "tokens_seen": 1261118464 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031207622868605815, + "loss": 2.8749, + "theoretical_loss": 3.5711676339437552, + "tokens_seen": 1261184000 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003120661985957874, + "loss": 2.7922, + "theoretical_loss": 3.57115072362466, + "tokens_seen": 1261249536 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031205616850551657, + "loss": 2.9258, + "theoretical_loss": 3.5711338144302376, + "tokens_seen": 1261315072 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031204613841524575, + "loss": 2.9527, + "theoretical_loss": 3.5711169063603547, + "tokens_seen": 1261380608 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031203610832497493, + "loss": 2.8443, + "theoretical_loss": 3.571099999414878, + "tokens_seen": 1261446144 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003120260782347041, + "loss": 2.804, + "theoretical_loss": 3.5710830935936757, + "tokens_seen": 1261511680 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2018834, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5431337356567383, + "objective/train/theoretical_loss": 3.571078867314027, + "objective/train/tokens_used": 1281988064, + "theoretical_loss": 3.571078867314027, + "tokens_seen": 1261528064 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003120160481444333, + "loss": 2.716, + "theoretical_loss": 3.5710661888966126, + "tokens_seen": 1261577216 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031200601805416253, + "loss": 2.8323, + "theoretical_loss": 3.571049285323557, + "tokens_seen": 1261642752 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031199598796389166, + "loss": 2.9333, + "theoretical_loss": 3.5710323828743746, + "tokens_seen": 1261708288 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003119859578736209, + "loss": 2.8145, + "theoretical_loss": 3.571015481548934, + "tokens_seen": 1261773824 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031197592778335, + "loss": 2.9461, + "theoretical_loss": 3.5709985813471006, + "tokens_seen": 1261839360 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031196589769307925, + "loss": 2.7873, + "theoretical_loss": 3.5709816822687417, + "tokens_seen": 1261904896 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031195586760280843, + "loss": 2.8632, + "theoretical_loss": 3.5709647843137247, + "tokens_seen": 1261970432 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003119458375125376, + "loss": 2.7447, + "theoretical_loss": 3.570947887481916, + "tokens_seen": 1262035968 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003119358074222668, + "loss": 2.811, + "theoretical_loss": 3.5709309917731833, + "tokens_seen": 1262101504 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031192577733199603, + "loss": 2.7637, + "theoretical_loss": 3.5709140971873934, + "tokens_seen": 1262167040 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031191574724172516, + "loss": 2.904, + "theoretical_loss": 3.570897203724413, + "tokens_seen": 1262232576 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003119057171514544, + "loss": 2.8187, + "theoretical_loss": 3.5708803113841094, + "tokens_seen": 1262298112 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003118956870611835, + "loss": 2.7835, + "theoretical_loss": 3.57086342016635, + "tokens_seen": 1262363648 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031188565697091276, + "loss": 2.9036, + "theoretical_loss": 3.5708465300710017, + "tokens_seen": 1262429184 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031187562688064194, + "loss": 2.7859, + "theoretical_loss": 3.5708296410979314, + "tokens_seen": 1262494720 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003118655967903711, + "loss": 2.828, + "theoretical_loss": 3.5708127532470066, + "tokens_seen": 1262560256 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003118555667001003, + "loss": 2.8157, + "theoretical_loss": 3.5707958665180946, + "tokens_seen": 1262625792 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003118455366098295, + "loss": 2.8206, + "theoretical_loss": 3.5707789809110624, + "tokens_seen": 1262691328 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031183550651955866, + "loss": 2.9608, + "theoretical_loss": 3.570762096425777, + "tokens_seen": 1262756864 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003118254764292879, + "loss": 2.7984, + "theoretical_loss": 3.5707452130621062, + "tokens_seen": 1262822400 + }, + { + "epoch": 3.07, + "learning_rate": 0.000311815446339017, + "loss": 2.7535, + "theoretical_loss": 3.570728330819917, + "tokens_seen": 1262887936 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031180541624874626, + "loss": 2.735, + "theoretical_loss": 3.5707114496990773, + "tokens_seen": 1262953472 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003117953861584754, + "loss": 2.7888, + "theoretical_loss": 3.5706945696994534, + "tokens_seen": 1263019008 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003117853560682046, + "loss": 2.9296, + "theoretical_loss": 3.5706776908209132, + "tokens_seen": 1263084544 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003117753259779338, + "loss": 2.7984, + "theoretical_loss": 3.570660813063324, + "tokens_seen": 1263150080 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2021599, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.977271795272827, + "objective/train/theoretical_loss": 3.5706565937990598, + "objective/train/tokens_used": 1283626464, + "theoretical_loss": 3.5706565937990598, + "tokens_seen": 1263166464 + }, + { + "epoch": 3.07, + "learning_rate": 0.000311765295887663, + "loss": 2.8715, + "theoretical_loss": 3.570643936426553, + "tokens_seen": 1263215616 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031175526579739217, + "loss": 2.9489, + "theoretical_loss": 3.5706270609104687, + "tokens_seen": 1263281152 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003117452357071214, + "loss": 2.6122, + "theoretical_loss": 3.570610186514937, + "tokens_seen": 1263346688 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031173520561685053, + "loss": 2.8758, + "theoretical_loss": 3.5705933132398266, + "tokens_seen": 1263412224 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031172517552657976, + "loss": 2.9186, + "theoretical_loss": 3.570576441085004, + "tokens_seen": 1263477760 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003117151454363089, + "loss": 2.7566, + "theoretical_loss": 3.5705595700503374, + "tokens_seen": 1263543296 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003117051153460381, + "loss": 2.7967, + "theoretical_loss": 3.570542700135694, + "tokens_seen": 1263608832 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003116950852557673, + "loss": 2.9344, + "theoretical_loss": 3.570525831340942, + "tokens_seen": 1263674368 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003116850551654965, + "loss": 3.0083, + "theoretical_loss": 3.570508963665948, + "tokens_seen": 1263739904 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031167502507522567, + "loss": 2.8453, + "theoretical_loss": 3.5704920971105802, + "tokens_seen": 1263805440 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031166499498495485, + "loss": 2.7463, + "theoretical_loss": 3.570475231674706, + "tokens_seen": 1263870976 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031165496489468403, + "loss": 2.824, + "theoretical_loss": 3.570458367358193, + "tokens_seen": 1263936512 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031164493480441327, + "loss": 2.772, + "theoretical_loss": 3.57044150416091, + "tokens_seen": 1264002048 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003116349047141424, + "loss": 2.6643, + "theoretical_loss": 3.570424642082723, + "tokens_seen": 1264067584 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031162487462387163, + "loss": 3.0156, + "theoretical_loss": 3.5704077811235004, + "tokens_seen": 1264133120 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003116148445336008, + "loss": 2.8298, + "theoretical_loss": 3.5703909212831104, + "tokens_seen": 1264198656 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031160481444333, + "loss": 2.8453, + "theoretical_loss": 3.57037406256142, + "tokens_seen": 1264264192 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031159478435305917, + "loss": 2.853, + "theoretical_loss": 3.570357204958298, + "tokens_seen": 1264329728 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031158475426278835, + "loss": 2.8114, + "theoretical_loss": 3.5703403484736107, + "tokens_seen": 1264395264 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031157472417251753, + "loss": 2.8733, + "theoretical_loss": 3.570323493107227, + "tokens_seen": 1264460800 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031156469408224677, + "loss": 2.7852, + "theoretical_loss": 3.570306638859015, + "tokens_seen": 1264526336 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031155466399197595, + "loss": 2.8142, + "theoretical_loss": 3.570289785728842, + "tokens_seen": 1264591872 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031154463390170513, + "loss": 2.748, + "theoretical_loss": 3.5702729337165757, + "tokens_seen": 1264657408 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003115346038114343, + "loss": 2.9957, + "theoretical_loss": 3.570256082822085, + "tokens_seen": 1264722944 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003115245737211635, + "loss": 2.8968, + "theoretical_loss": 3.5702392330452364, + "tokens_seen": 1264788480 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2024598, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0535192489624023, + "objective/train/theoretical_loss": 3.57023502077564, + "objective/train/tokens_used": 1285264864, + "theoretical_loss": 3.57023502077564, + "tokens_seen": 1264804864 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031151454363089273, + "loss": 2.856, + "theoretical_loss": 3.5702223843858993, + "tokens_seen": 1264854016 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031150451354062186, + "loss": 2.7509, + "theoretical_loss": 3.5702055368439405, + "tokens_seen": 1264919552 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003114944834503511, + "loss": 3.0034, + "theoretical_loss": 3.5701886904192284, + "tokens_seen": 1264985088 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003114844533600802, + "loss": 3.0388, + "theoretical_loss": 3.570171845111632, + "tokens_seen": 1265050624 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031147442326980945, + "loss": 2.9901, + "theoretical_loss": 3.570155000921018, + "tokens_seen": 1265116160 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031146439317953863, + "loss": 2.8428, + "theoretical_loss": 3.5701381578472553, + "tokens_seen": 1265181696 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003114543630892678, + "loss": 2.7746, + "theoretical_loss": 3.570121315890211, + "tokens_seen": 1265247232 + }, + { + "epoch": 3.07, + "learning_rate": 0.000311444332998997, + "loss": 2.7767, + "theoretical_loss": 3.570104475049755, + "tokens_seen": 1265312768 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031143430290872623, + "loss": 2.8383, + "theoretical_loss": 3.570087635325754, + "tokens_seen": 1265378304 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031142427281845536, + "loss": 2.8891, + "theoretical_loss": 3.5700707967180763, + "tokens_seen": 1265443840 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003114142427281846, + "loss": 2.8114, + "theoretical_loss": 3.5700539592265903, + "tokens_seen": 1265509376 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003114042126379137, + "loss": 2.819, + "theoretical_loss": 3.570037122851165, + "tokens_seen": 1265574912 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031139418254764296, + "loss": 2.7334, + "theoretical_loss": 3.5700202875916673, + "tokens_seen": 1265640448 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031138415245737214, + "loss": 2.9693, + "theoretical_loss": 3.570003453447966, + "tokens_seen": 1265705984 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003113741223671013, + "loss": 2.939, + "theoretical_loss": 3.5699866204199298, + "tokens_seen": 1265771520 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003113640922768305, + "loss": 2.8425, + "theoretical_loss": 3.569969788507426, + "tokens_seen": 1265837056 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003113540621865597, + "loss": 2.9611, + "theoretical_loss": 3.569952957710324, + "tokens_seen": 1265902592 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031134403209628886, + "loss": 2.8478, + "theoretical_loss": 3.569936128028492, + "tokens_seen": 1265968128 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003113340020060181, + "loss": 2.8093, + "theoretical_loss": 3.569919299461797, + "tokens_seen": 1266033664 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003113239719157472, + "loss": 2.8434, + "theoretical_loss": 3.56990247201011, + "tokens_seen": 1266099200 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031131394182547646, + "loss": 2.8436, + "theoretical_loss": 3.569885645673297, + "tokens_seen": 1266164736 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003113039117352056, + "loss": 2.7374, + "theoretical_loss": 3.5698688204512274, + "tokens_seen": 1266230272 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003112938816449348, + "loss": 2.8227, + "theoretical_loss": 3.5698519963437696, + "tokens_seen": 1266295808 + }, + { + "epoch": 3.07, + "learning_rate": 0.000311283851554664, + "loss": 2.6112, + "theoretical_loss": 3.569835173350792, + "tokens_seen": 1266361344 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003112738214643932, + "loss": 3.0445, + "theoretical_loss": 3.569818351472163, + "tokens_seen": 1266426880 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2027075, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9765138626098633, + "objective/train/theoretical_loss": 3.5698141461766077, + "objective/train/tokens_used": 1286903264, + "theoretical_loss": 3.5698141461766077, + "tokens_seen": 1266443264 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031126379137412237, + "loss": 2.9028, + "theoretical_loss": 3.5698015307077515, + "tokens_seen": 1266492416 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003112537612838516, + "loss": 2.871, + "theoretical_loss": 3.569784711057426, + "tokens_seen": 1266557952 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031124373119358073, + "loss": 2.837, + "theoretical_loss": 3.569767892521055, + "tokens_seen": 1266623488 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031123370110330996, + "loss": 2.72, + "theoretical_loss": 3.5697510750985066, + "tokens_seen": 1266689024 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003112236710130391, + "loss": 2.8855, + "theoretical_loss": 3.5697342587896497, + "tokens_seen": 1266754560 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003112136409227683, + "loss": 2.66, + "theoretical_loss": 3.569717443594354, + "tokens_seen": 1266820096 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003112036108324975, + "loss": 2.9083, + "theoretical_loss": 3.569700629512486, + "tokens_seen": 1266885632 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003111935807422267, + "loss": 2.771, + "theoretical_loss": 3.5696838165439164, + "tokens_seen": 1266951168 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031118355065195587, + "loss": 2.7095, + "theoretical_loss": 3.5696670046885126, + "tokens_seen": 1267016704 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031117352056168505, + "loss": 2.9879, + "theoretical_loss": 3.569650193946144, + "tokens_seen": 1267082240 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031116349047141423, + "loss": 2.7884, + "theoretical_loss": 3.5696333843166794, + "tokens_seen": 1267147776 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031115346038114347, + "loss": 2.604, + "theoretical_loss": 3.569616575799987, + "tokens_seen": 1267213312 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003111434302908726, + "loss": 2.8565, + "theoretical_loss": 3.5695997683959364, + "tokens_seen": 1267278848 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031113340020060183, + "loss": 2.9879, + "theoretical_loss": 3.5695829621043953, + "tokens_seen": 1267344384 + }, + { + "epoch": 3.07, + "learning_rate": 0.000311123370110331, + "loss": 2.7479, + "theoretical_loss": 3.5695661569252337, + "tokens_seen": 1267409920 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003111133400200602, + "loss": 2.7787, + "theoretical_loss": 3.5695493528583198, + "tokens_seen": 1267475456 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031110330992978937, + "loss": 2.814, + "theoretical_loss": 3.5695325499035224, + "tokens_seen": 1267540992 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031109327983951855, + "loss": 2.8884, + "theoretical_loss": 3.569515748060711, + "tokens_seen": 1267606528 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031108324974924773, + "loss": 2.9798, + "theoretical_loss": 3.5694989473297536, + "tokens_seen": 1267672064 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031107321965897697, + "loss": 2.9293, + "theoretical_loss": 3.56948214771052, + "tokens_seen": 1267737600 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003110631895687061, + "loss": 2.8816, + "theoretical_loss": 3.569465349202879, + "tokens_seen": 1267803136 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031105315947843533, + "loss": 2.7278, + "theoretical_loss": 3.5694485518066994, + "tokens_seen": 1267868672 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031104312938816446, + "loss": 2.7818, + "theoretical_loss": 3.5694317555218507, + "tokens_seen": 1267934208 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003110330992978937, + "loss": 2.9907, + "theoretical_loss": 3.569414960348201, + "tokens_seen": 1267999744 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003110230692076229, + "loss": 2.5487, + "theoretical_loss": 3.56939816628562, + "tokens_seen": 1268065280 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2028563, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0797383785247803, + "objective/train/theoretical_loss": 3.569393967943564, + "objective/train/tokens_used": 1288541664, + "theoretical_loss": 3.569393967943564, + "tokens_seen": 1268081664 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031101303911735206, + "loss": 2.9569, + "theoretical_loss": 3.569381373333977, + "tokens_seen": 1268130816 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031100300902708124, + "loss": 2.752, + "theoretical_loss": 3.56936458149314, + "tokens_seen": 1268196352 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003109929789368104, + "loss": 2.8699, + "theoretical_loss": 3.5693477907629796, + "tokens_seen": 1268261888 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003109829488465396, + "loss": 2.7927, + "theoretical_loss": 3.5693310011433637, + "tokens_seen": 1268327424 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031097291875626883, + "loss": 2.7069, + "theoretical_loss": 3.5693142126341626, + "tokens_seen": 1268392960 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031096288866599796, + "loss": 2.7311, + "theoretical_loss": 3.5692974252352445, + "tokens_seen": 1268458496 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003109528585757272, + "loss": 2.8543, + "theoretical_loss": 3.5692806389464793, + "tokens_seen": 1268524032 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003109428284854564, + "loss": 2.8212, + "theoretical_loss": 3.569263853767736, + "tokens_seen": 1268589568 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031093279839518556, + "loss": 2.7236, + "theoretical_loss": 3.569247069698883, + "tokens_seen": 1268655104 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031092276830491474, + "loss": 2.9884, + "theoretical_loss": 3.569230286739791, + "tokens_seen": 1268720640 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003109127382146439, + "loss": 2.8975, + "theoretical_loss": 3.569213504890329, + "tokens_seen": 1268786176 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003109027081243731, + "loss": 2.9404, + "theoretical_loss": 3.569196724150366, + "tokens_seen": 1268851712 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031089267803410234, + "loss": 2.8128, + "theoretical_loss": 3.569179944519771, + "tokens_seen": 1268917248 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031088264794383147, + "loss": 2.8078, + "theoretical_loss": 3.5691631659984138, + "tokens_seen": 1268982784 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003108726178535607, + "loss": 2.6804, + "theoretical_loss": 3.569146388586164, + "tokens_seen": 1269048320 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031086258776328983, + "loss": 2.8276, + "theoretical_loss": 3.5691296122828904, + "tokens_seen": 1269113856 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031085255767301906, + "loss": 2.8567, + "theoretical_loss": 3.569112837088463, + "tokens_seen": 1269179392 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031084252758274824, + "loss": 2.8967, + "theoretical_loss": 3.569096063002751, + "tokens_seen": 1269244928 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003108324974924774, + "loss": 2.6483, + "theoretical_loss": 3.5690792900256243, + "tokens_seen": 1269310464 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003108224674022066, + "loss": 2.6858, + "theoretical_loss": 3.5690625181569513, + "tokens_seen": 1269376000 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003108124373119358, + "loss": 2.6025, + "theoretical_loss": 3.5690457473966024, + "tokens_seen": 1269441536 + }, + { + "epoch": 3.07, + "learning_rate": 0.000310802407221665, + "loss": 2.8361, + "theoretical_loss": 3.5690289777444475, + "tokens_seen": 1269507072 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003107923771313942, + "loss": 2.7998, + "theoretical_loss": 3.5690122092003556, + "tokens_seen": 1269572608 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003107823470411234, + "loss": 2.956, + "theoretical_loss": 3.568995441764196, + "tokens_seen": 1269638144 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031077231695085257, + "loss": 2.9067, + "theoretical_loss": 3.568978675435839, + "tokens_seen": 1269703680 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2031593, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7910854816436768, + "objective/train/theoretical_loss": 3.5689744840268283, + "objective/train/tokens_used": 1290180064, + "theoretical_loss": 3.5689744840268283, + "tokens_seen": 1269720064 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003107622868605818, + "loss": 2.8132, + "theoretical_loss": 3.5689619102151537, + "tokens_seen": 1269769216 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031075225677031093, + "loss": 2.7747, + "theoretical_loss": 3.56894514610201, + "tokens_seen": 1269834752 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031074222668004016, + "loss": 2.8624, + "theoretical_loss": 3.5689283830962775, + "tokens_seen": 1269900288 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003107321965897693, + "loss": 2.8389, + "theoretical_loss": 3.568911621197826, + "tokens_seen": 1269965824 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003107221664994985, + "loss": 2.8508, + "theoretical_loss": 3.5688948604065254, + "tokens_seen": 1270031360 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003107121364092277, + "loss": 2.767, + "theoretical_loss": 3.568878100722245, + "tokens_seen": 1270096896 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003107021063189569, + "loss": 2.7497, + "theoretical_loss": 3.5688613421448547, + "tokens_seen": 1270162432 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031069207622868607, + "loss": 2.8244, + "theoretical_loss": 3.5688445846742245, + "tokens_seen": 1270227968 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031068204613841525, + "loss": 2.7155, + "theoretical_loss": 3.568827828310224, + "tokens_seen": 1270293504 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031067201604814443, + "loss": 2.8605, + "theoretical_loss": 3.568811073052723, + "tokens_seen": 1270359040 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031066198595787367, + "loss": 2.9057, + "theoretical_loss": 3.568794318901592, + "tokens_seen": 1270424576 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003106519558676028, + "loss": 2.8501, + "theoretical_loss": 3.5687775658567, + "tokens_seen": 1270490112 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031064192577733203, + "loss": 2.7854, + "theoretical_loss": 3.568760813917917, + "tokens_seen": 1270555648 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003106318956870612, + "loss": 2.8184, + "theoretical_loss": 3.5687440630851137, + "tokens_seen": 1270621184 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003106218655967904, + "loss": 2.9737, + "theoretical_loss": 3.568727313358159, + "tokens_seen": 1270686720 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031061183550651957, + "loss": 2.6612, + "theoretical_loss": 3.568710564736924, + "tokens_seen": 1270752256 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031060180541624875, + "loss": 2.7472, + "theoretical_loss": 3.568693817221278, + "tokens_seen": 1270817792 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031059177532597793, + "loss": 2.871, + "theoretical_loss": 3.5686770708110904, + "tokens_seen": 1270883328 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031058174523570717, + "loss": 2.896, + "theoretical_loss": 3.568660325506232, + "tokens_seen": 1270948864 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003105717151454363, + "loss": 2.9461, + "theoretical_loss": 3.5686435813065733, + "tokens_seen": 1271014400 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031056168505516553, + "loss": 2.6493, + "theoretical_loss": 3.568626838211984, + "tokens_seen": 1271079936 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031055165496489466, + "loss": 2.8699, + "theoretical_loss": 3.5686100962223337, + "tokens_seen": 1271145472 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003105416248746239, + "loss": 2.8153, + "theoretical_loss": 3.5685933553374927, + "tokens_seen": 1271211008 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003105315947843531, + "loss": 2.7417, + "theoretical_loss": 3.5685766155573315, + "tokens_seen": 1271276544 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031052156469408226, + "loss": 2.6989, + "theoretical_loss": 3.56855987688172, + "tokens_seen": 1271342080 + }, + { + "epoch": 3.07, + "objective/train/docs_used": 2034445, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8023555278778076, + "objective/train/theoretical_loss": 3.568555692385388, + "objective/train/tokens_used": 1291818464, + "theoretical_loss": 3.568555692385388, + "tokens_seen": 1271358464 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031051153460381144, + "loss": 2.8523, + "theoretical_loss": 3.5685431393105285, + "tokens_seen": 1271407616 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003105015045135406, + "loss": 3.0045, + "theoretical_loss": 3.568526402843627, + "tokens_seen": 1271473152 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003104914744232698, + "loss": 2.8946, + "theoretical_loss": 3.5685096674808863, + "tokens_seen": 1271538688 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031048144433299904, + "loss": 2.8554, + "theoretical_loss": 3.568492933222176, + "tokens_seen": 1271604224 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031047141424272816, + "loss": 2.7871, + "theoretical_loss": 3.5684762000673667, + "tokens_seen": 1271669760 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003104613841524574, + "loss": 2.9869, + "theoretical_loss": 3.5684594680163286, + "tokens_seen": 1271735296 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003104513540621866, + "loss": 2.9184, + "theoretical_loss": 3.5684427370689313, + "tokens_seen": 1271800832 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031044132397191576, + "loss": 2.6751, + "theoretical_loss": 3.568426007225047, + "tokens_seen": 1271866368 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031043129388164494, + "loss": 2.9767, + "theoretical_loss": 3.5684092784845443, + "tokens_seen": 1271931904 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003104212637913741, + "loss": 2.9216, + "theoretical_loss": 3.5683925508472942, + "tokens_seen": 1271997440 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003104112337011033, + "loss": 2.9935, + "theoretical_loss": 3.568375824313167, + "tokens_seen": 1272062976 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031040120361083254, + "loss": 2.7421, + "theoretical_loss": 3.5683590988820333, + "tokens_seen": 1272128512 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031039117352056167, + "loss": 2.9623, + "theoretical_loss": 3.568342374553764, + "tokens_seen": 1272194048 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003103811434302909, + "loss": 2.859, + "theoretical_loss": 3.5683256513282284, + "tokens_seen": 1272259584 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031037111334002003, + "loss": 2.7587, + "theoretical_loss": 3.5683089292052985, + "tokens_seen": 1272325120 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031036108324974926, + "loss": 2.8878, + "theoretical_loss": 3.568292208184843, + "tokens_seen": 1272390656 + }, + { + "epoch": 3.07, + "learning_rate": 0.00031035105315947844, + "loss": 2.8037, + "theoretical_loss": 3.5682754882667345, + "tokens_seen": 1272456192 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003103410230692076, + "loss": 2.8874, + "theoretical_loss": 3.5682587694508414, + "tokens_seen": 1272521728 + }, + { + "epoch": 3.07, + "learning_rate": 0.0003103309929789368, + "loss": 2.7953, + "theoretical_loss": 3.5682420517370357, + "tokens_seen": 1272587264 + }, + { + "epoch": 3.08, + "learning_rate": 0.000310320962888666, + "loss": 2.866, + "theoretical_loss": 3.568225335125188, + "tokens_seen": 1272652800 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031031093279839517, + "loss": 2.8363, + "theoretical_loss": 3.568208619615168, + "tokens_seen": 1272718336 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003103009027081244, + "loss": 2.7918, + "theoretical_loss": 3.568191905206847, + "tokens_seen": 1272783872 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031029087261785353, + "loss": 2.8358, + "theoretical_loss": 3.568175191900096, + "tokens_seen": 1272849408 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031028084252758277, + "loss": 2.6179, + "theoretical_loss": 3.5681584796947847, + "tokens_seen": 1272914944 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031027081243731195, + "loss": 2.8112, + "theoretical_loss": 3.5681417685907846, + "tokens_seen": 1272980480 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2037271, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9106063842773438, + "objective/train/theoretical_loss": 3.5681375909868494, + "objective/train/tokens_used": 1293456864, + "theoretical_loss": 3.5681375909868494, + "tokens_seen": 1272996864 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031026078234704113, + "loss": 2.7681, + "theoretical_loss": 3.5681250585879667, + "tokens_seen": 1273046016 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003102507522567703, + "loss": 2.8585, + "theoretical_loss": 3.5681083496862005, + "tokens_seen": 1273111552 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003102407221664995, + "loss": 2.8424, + "theoretical_loss": 3.568091641885358, + "tokens_seen": 1273177088 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031023069207622867, + "loss": 2.7924, + "theoretical_loss": 3.5680749351853094, + "tokens_seen": 1273242624 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003102206619859579, + "loss": 2.9542, + "theoretical_loss": 3.5680582295859256, + "tokens_seen": 1273308160 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031021063189568703, + "loss": 2.9366, + "theoretical_loss": 3.5680415250870774, + "tokens_seen": 1273373696 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031020060180541627, + "loss": 2.8124, + "theoretical_loss": 3.568024821688636, + "tokens_seen": 1273439232 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003101905717151454, + "loss": 2.7876, + "theoretical_loss": 3.5680081193904725, + "tokens_seen": 1273504768 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031018054162487463, + "loss": 2.7887, + "theoretical_loss": 3.5679914181924564, + "tokens_seen": 1273570304 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003101705115346038, + "loss": 2.9496, + "theoretical_loss": 3.56797471809446, + "tokens_seen": 1273635840 + }, + { + "epoch": 3.08, + "learning_rate": 0.000310160481444333, + "loss": 2.6854, + "theoretical_loss": 3.567958019096354, + "tokens_seen": 1273701376 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003101504513540622, + "loss": 2.8969, + "theoretical_loss": 3.567941321198009, + "tokens_seen": 1273766912 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003101404212637914, + "loss": 2.6938, + "theoretical_loss": 3.5679246243992964, + "tokens_seen": 1273832448 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031013039117352054, + "loss": 2.6761, + "theoretical_loss": 3.5679079287000866, + "tokens_seen": 1273897984 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031012036108324977, + "loss": 2.8597, + "theoretical_loss": 3.5678912341002516, + "tokens_seen": 1273963520 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003101103309929789, + "loss": 2.9379, + "theoretical_loss": 3.5678745405996617, + "tokens_seen": 1274029056 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031010030090270813, + "loss": 2.8431, + "theoretical_loss": 3.567857848198188, + "tokens_seen": 1274094592 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003100902708124373, + "loss": 2.734, + "theoretical_loss": 3.5678411568957022, + "tokens_seen": 1274160128 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003100802407221665, + "loss": 2.7718, + "theoretical_loss": 3.567824466692074, + "tokens_seen": 1274225664 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003100702106318957, + "loss": 2.8619, + "theoretical_loss": 3.567807777587177, + "tokens_seen": 1274291200 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031006018054162486, + "loss": 2.866, + "theoretical_loss": 3.56779108958088, + "tokens_seen": 1274356736 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003100501504513541, + "loss": 2.7139, + "theoretical_loss": 3.567774402673056, + "tokens_seen": 1274422272 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003100401203610833, + "loss": 2.7851, + "theoretical_loss": 3.567757716863574, + "tokens_seen": 1274487808 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031003009027081246, + "loss": 2.7445, + "theoretical_loss": 3.5677410321523078, + "tokens_seen": 1274553344 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031002006018054164, + "loss": 2.7944, + "theoretical_loss": 3.567724348539127, + "tokens_seen": 1274618880 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2040189, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8895514011383057, + "objective/train/theoretical_loss": 3.5677201778073924, + "objective/train/tokens_used": 1295095264, + "theoretical_loss": 3.5677201778073924, + "tokens_seen": 1274635264 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003100100300902708, + "loss": 2.7262, + "theoretical_loss": 3.567707666023903, + "tokens_seen": 1274684416 + }, + { + "epoch": 3.08, + "learning_rate": 0.00031, + "loss": 2.8398, + "theoretical_loss": 3.567690984606508, + "tokens_seen": 1274749952 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030998996990972924, + "loss": 2.914, + "theoretical_loss": 3.567674304286812, + "tokens_seen": 1274815488 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030997993981945836, + "loss": 2.8236, + "theoretical_loss": 3.5676576250646876, + "tokens_seen": 1274881024 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003099699097291876, + "loss": 2.7844, + "theoretical_loss": 3.5676409469400054, + "tokens_seen": 1274946560 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003099598796389168, + "loss": 2.9285, + "theoretical_loss": 3.5676242699126375, + "tokens_seen": 1275012096 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030994984954864596, + "loss": 2.7194, + "theoretical_loss": 3.567607593982454, + "tokens_seen": 1275077632 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030993981945837514, + "loss": 2.9015, + "theoretical_loss": 3.5675909191493282, + "tokens_seen": 1275143168 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003099297893681043, + "loss": 2.679, + "theoretical_loss": 3.56757424541313, + "tokens_seen": 1275208704 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003099197592778335, + "loss": 2.8374, + "theoretical_loss": 3.567557572773731, + "tokens_seen": 1275274240 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030990972918756274, + "loss": 2.8148, + "theoretical_loss": 3.5675409012310038, + "tokens_seen": 1275339776 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030989969909729187, + "loss": 2.7862, + "theoretical_loss": 3.5675242307848185, + "tokens_seen": 1275405312 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003098896690070211, + "loss": 2.8589, + "theoretical_loss": 3.5675075614350478, + "tokens_seen": 1275470848 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030987963891675023, + "loss": 2.8357, + "theoretical_loss": 3.567490893181562, + "tokens_seen": 1275536384 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030986960882647946, + "loss": 2.8439, + "theoretical_loss": 3.567474226024234, + "tokens_seen": 1275601920 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030985957873620864, + "loss": 2.7427, + "theoretical_loss": 3.567457559962935, + "tokens_seen": 1275667456 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003098495486459378, + "loss": 2.8006, + "theoretical_loss": 3.567440894997536, + "tokens_seen": 1275732992 + }, + { + "epoch": 3.08, + "learning_rate": 0.000309839518555667, + "loss": 2.8906, + "theoretical_loss": 3.56742423112791, + "tokens_seen": 1275798528 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003098294884653962, + "loss": 2.748, + "theoretical_loss": 3.5674075683539272, + "tokens_seen": 1275864064 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030981945837512537, + "loss": 2.8763, + "theoretical_loss": 3.56739090667546, + "tokens_seen": 1275929600 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003098094282848546, + "loss": 2.9016, + "theoretical_loss": 3.5673742460923803, + "tokens_seen": 1275995136 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030979939819458373, + "loss": 2.8845, + "theoretical_loss": 3.567357586604559, + "tokens_seen": 1276060672 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030978936810431297, + "loss": 2.7414, + "theoretical_loss": 3.5673409282118684, + "tokens_seen": 1276126208 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030977933801404215, + "loss": 2.9098, + "theoretical_loss": 3.56732427091418, + "tokens_seen": 1276191744 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030976930792377133, + "loss": 2.7466, + "theoretical_loss": 3.5673076147113667, + "tokens_seen": 1276257280 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2042188, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.932711601257324, + "objective/train/theoretical_loss": 3.5673034508317225, + "objective/train/tokens_used": 1296733664, + "theoretical_loss": 3.5673034508317225, + "tokens_seen": 1276273664 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003097592778335005, + "loss": 2.9874, + "theoretical_loss": 3.567290959603299, + "tokens_seen": 1276322816 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003097492477432297, + "loss": 2.8431, + "theoretical_loss": 3.5672743055898497, + "tokens_seen": 1276388352 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030973921765295887, + "loss": 2.5813, + "theoretical_loss": 3.5672576526708895, + "tokens_seen": 1276453888 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003097291875626881, + "loss": 2.8899, + "theoretical_loss": 3.567241000846291, + "tokens_seen": 1276519424 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030971915747241723, + "loss": 2.8121, + "theoretical_loss": 3.567224350115927, + "tokens_seen": 1276584960 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030970912738214647, + "loss": 2.6809, + "theoretical_loss": 3.5672077004796674, + "tokens_seen": 1276650496 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003096990972918756, + "loss": 2.723, + "theoretical_loss": 3.5671910519373853, + "tokens_seen": 1276716032 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030968906720160483, + "loss": 2.841, + "theoretical_loss": 3.567174404488953, + "tokens_seen": 1276781568 + }, + { + "epoch": 3.08, + "learning_rate": 0.000309679037111334, + "loss": 2.6688, + "theoretical_loss": 3.5671577581342424, + "tokens_seen": 1276847104 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003096690070210632, + "loss": 2.5445, + "theoretical_loss": 3.5671411128731245, + "tokens_seen": 1276912640 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003096589769307924, + "loss": 2.9244, + "theoretical_loss": 3.5671244687054724, + "tokens_seen": 1276978176 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003096489468405216, + "loss": 2.7798, + "theoretical_loss": 3.5671078256311577, + "tokens_seen": 1277043712 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030963891675025074, + "loss": 2.8469, + "theoretical_loss": 3.567091183650053, + "tokens_seen": 1277109248 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030962888665997997, + "loss": 2.7414, + "theoretical_loss": 3.5670745427620294, + "tokens_seen": 1277174784 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003096188565697091, + "loss": 2.8812, + "theoretical_loss": 3.56705790296696, + "tokens_seen": 1277240320 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030960882647943833, + "loss": 2.8646, + "theoretical_loss": 3.567041264264716, + "tokens_seen": 1277305856 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003095987963891675, + "loss": 2.7264, + "theoretical_loss": 3.5670246266551704, + "tokens_seen": 1277371392 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003095887662988967, + "loss": 2.8967, + "theoretical_loss": 3.567007990138195, + "tokens_seen": 1277436928 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003095787362086259, + "loss": 2.7364, + "theoretical_loss": 3.5669913547136627, + "tokens_seen": 1277502464 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030956870611835506, + "loss": 2.8344, + "theoretical_loss": 3.5669747203814444, + "tokens_seen": 1277568000 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030955867602808424, + "loss": 2.9608, + "theoretical_loss": 3.5669580871414133, + "tokens_seen": 1277633536 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003095486459378135, + "loss": 3.0561, + "theoretical_loss": 3.566941454993441, + "tokens_seen": 1277699072 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003095386158475426, + "loss": 2.8255, + "theoretical_loss": 3.5669248239374007, + "tokens_seen": 1277764608 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030952858575727184, + "loss": 2.7334, + "theoretical_loss": 3.5669081939731644, + "tokens_seen": 1277830144 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030951855566700096, + "loss": 2.7197, + "theoretical_loss": 3.5668915651006037, + "tokens_seen": 1277895680 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2045095, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4064037799835205, + "objective/train/theoretical_loss": 3.5668874080530233, + "objective/train/tokens_used": 1298372064, + "theoretical_loss": 3.5668874080530233, + "tokens_seen": 1277912064 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003095085255767302, + "loss": 2.7933, + "theoretical_loss": 3.566874937319592, + "tokens_seen": 1277961216 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003094984954864594, + "loss": 2.7513, + "theoretical_loss": 3.566858310630001, + "tokens_seen": 1278026752 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030948846539618856, + "loss": 2.7101, + "theoretical_loss": 3.5668416850317035, + "tokens_seen": 1278092288 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030947843530591774, + "loss": 2.866, + "theoretical_loss": 3.5668250605245717, + "tokens_seen": 1278157824 + }, + { + "epoch": 3.08, + "learning_rate": 0.000309468405215647, + "loss": 2.6921, + "theoretical_loss": 3.566808437108478, + "tokens_seen": 1278223360 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003094583751253761, + "loss": 2.8237, + "theoretical_loss": 3.566791814783295, + "tokens_seen": 1278288896 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030944834503510534, + "loss": 2.6994, + "theoretical_loss": 3.566775193548895, + "tokens_seen": 1278354432 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030943831494483447, + "loss": 2.8198, + "theoretical_loss": 3.5667585734051506, + "tokens_seen": 1278419968 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003094282848545637, + "loss": 2.7629, + "theoretical_loss": 3.5667419543519343, + "tokens_seen": 1278485504 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003094182547642929, + "loss": 2.8232, + "theoretical_loss": 3.566725336389119, + "tokens_seen": 1278551040 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030940822467402207, + "loss": 2.6644, + "theoretical_loss": 3.5667087195165763, + "tokens_seen": 1278616576 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030939819458375125, + "loss": 2.699, + "theoretical_loss": 3.56669210373418, + "tokens_seen": 1278682112 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030938816449348043, + "loss": 2.8094, + "theoretical_loss": 3.5666754890418026, + "tokens_seen": 1278747648 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003093781344032096, + "loss": 2.8965, + "theoretical_loss": 3.566658875439316, + "tokens_seen": 1278813184 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030936810431293884, + "loss": 2.8444, + "theoretical_loss": 3.566642262926593, + "tokens_seen": 1278878720 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030935807422266797, + "loss": 2.7434, + "theoretical_loss": 3.5666256515035064, + "tokens_seen": 1278944256 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003093480441323972, + "loss": 2.6846, + "theoretical_loss": 3.566609041169929, + "tokens_seen": 1279009792 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030933801404212633, + "loss": 2.7648, + "theoretical_loss": 3.5665924319257334, + "tokens_seen": 1279075328 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030932798395185557, + "loss": 2.8441, + "theoretical_loss": 3.5665758237707923, + "tokens_seen": 1279140864 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030931795386158475, + "loss": 2.7107, + "theoretical_loss": 3.566559216704979, + "tokens_seen": 1279206400 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030930792377131393, + "loss": 2.944, + "theoretical_loss": 3.566542610728166, + "tokens_seen": 1279271936 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030929789368104317, + "loss": 2.8119, + "theoretical_loss": 3.5665260058402253, + "tokens_seen": 1279337472 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030928786359077235, + "loss": 2.8021, + "theoretical_loss": 3.5665094020410306, + "tokens_seen": 1279403008 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030927783350050153, + "loss": 2.8992, + "theoretical_loss": 3.5664927993304545, + "tokens_seen": 1279468544 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003092678034102307, + "loss": 2.831, + "theoretical_loss": 3.5664761977083703, + "tokens_seen": 1279534080 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2047951, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.902060031890869, + "objective/train/theoretical_loss": 3.566472047472911, + "objective/train/tokens_used": 1300010464, + "theoretical_loss": 3.566472047472911, + "tokens_seen": 1279550464 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003092577733199599, + "loss": 2.6236, + "theoretical_loss": 3.56645959717465, + "tokens_seen": 1279599616 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030924774322968907, + "loss": 2.6897, + "theoretical_loss": 3.5664429977291676, + "tokens_seen": 1279665152 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003092377131394183, + "loss": 2.8492, + "theoretical_loss": 3.566426399371795, + "tokens_seen": 1279730688 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030922768304914743, + "loss": 2.8107, + "theoretical_loss": 3.566409802102406, + "tokens_seen": 1279796224 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030921765295887667, + "loss": 2.9061, + "theoretical_loss": 3.566393205920873, + "tokens_seen": 1279861760 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003092076228686058, + "loss": 2.6489, + "theoretical_loss": 3.566376610827069, + "tokens_seen": 1279927296 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030919759277833503, + "loss": 2.5212, + "theoretical_loss": 3.5663600168208673, + "tokens_seen": 1279992832 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003091875626880642, + "loss": 3.0069, + "theoretical_loss": 3.5663434239021408, + "tokens_seen": 1280058368 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003091775325977934, + "loss": 2.9228, + "theoretical_loss": 3.5663268320707626, + "tokens_seen": 1280123904 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003091675025075226, + "loss": 2.7277, + "theoretical_loss": 3.5663102413266063, + "tokens_seen": 1280189440 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003091574724172518, + "loss": 2.8987, + "theoretical_loss": 3.566293651669544, + "tokens_seen": 1280254976 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030914744232698094, + "loss": 2.8832, + "theoretical_loss": 3.5662770630994496, + "tokens_seen": 1280320512 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030913741223671017, + "loss": 2.6437, + "theoretical_loss": 3.5662604756161955, + "tokens_seen": 1280386048 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003091273821464393, + "loss": 2.7695, + "theoretical_loss": 3.5662438892196553, + "tokens_seen": 1280451584 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030911735205616853, + "loss": 2.8363, + "theoretical_loss": 3.5662273039097028, + "tokens_seen": 1280517120 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003091073219658977, + "loss": 2.748, + "theoretical_loss": 3.56621071968621, + "tokens_seen": 1280582656 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003090972918756269, + "loss": 2.8038, + "theoretical_loss": 3.5661941365490515, + "tokens_seen": 1280648192 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003090872617853561, + "loss": 2.779, + "theoretical_loss": 3.5661775544980996, + "tokens_seen": 1280713728 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030907723169508526, + "loss": 2.8946, + "theoretical_loss": 3.5661609735332274, + "tokens_seen": 1280779264 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030906720160481444, + "loss": 2.837, + "theoretical_loss": 3.566144393654309, + "tokens_seen": 1280844800 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003090571715145437, + "loss": 2.7987, + "theoretical_loss": 3.5661278148612174, + "tokens_seen": 1280910336 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003090471414242728, + "loss": 2.7577, + "theoretical_loss": 3.5661112371538257, + "tokens_seen": 1280975872 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030903711133400204, + "loss": 2.7609, + "theoretical_loss": 3.5660946605320074, + "tokens_seen": 1281041408 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030902708124373116, + "loss": 2.7984, + "theoretical_loss": 3.566078084995636, + "tokens_seen": 1281106944 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003090170511534604, + "loss": 2.7983, + "theoretical_loss": 3.5660615105445848, + "tokens_seen": 1281172480 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2050720, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.798025608062744, + "objective/train/theoretical_loss": 3.566057367101388, + "objective/train/tokens_used": 1301648864, + "theoretical_loss": 3.566057367101388, + "tokens_seen": 1281188864 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003090070210631896, + "loss": 2.8116, + "theoretical_loss": 3.566044937178727, + "tokens_seen": 1281238016 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030899699097291876, + "loss": 2.8427, + "theoretical_loss": 3.566028364897936, + "tokens_seen": 1281303552 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030898696088264794, + "loss": 2.7973, + "theoretical_loss": 3.5660117937020863, + "tokens_seen": 1281369088 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003089769307923772, + "loss": 2.9045, + "theoretical_loss": 3.5659952235910506, + "tokens_seen": 1281434624 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003089669007021063, + "loss": 2.8706, + "theoretical_loss": 3.565978654564702, + "tokens_seen": 1281500160 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030895687061183554, + "loss": 2.8345, + "theoretical_loss": 3.565962086622914, + "tokens_seen": 1281565696 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030894684052156467, + "loss": 2.861, + "theoretical_loss": 3.5659455197655614, + "tokens_seen": 1281631232 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003089368104312939, + "loss": 2.6906, + "theoretical_loss": 3.565928953992517, + "tokens_seen": 1281696768 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003089267803410231, + "loss": 2.8631, + "theoretical_loss": 3.565912389303654, + "tokens_seen": 1281762304 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030891675025075227, + "loss": 2.6867, + "theoretical_loss": 3.5658958256988464, + "tokens_seen": 1281827840 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030890672016048145, + "loss": 2.9033, + "theoretical_loss": 3.565879263177968, + "tokens_seen": 1281893376 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030889669007021063, + "loss": 2.8221, + "theoretical_loss": 3.5658627017408926, + "tokens_seen": 1281958912 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003088866599799398, + "loss": 2.7063, + "theoretical_loss": 3.5658461413874933, + "tokens_seen": 1282024448 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030887662988966904, + "loss": 2.8374, + "theoretical_loss": 3.565829582117644, + "tokens_seen": 1282089984 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030886659979939817, + "loss": 3.0059, + "theoretical_loss": 3.5658130239312182, + "tokens_seen": 1282155520 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003088565697091274, + "loss": 2.7664, + "theoretical_loss": 3.5657964668280906, + "tokens_seen": 1282221056 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030884653961885653, + "loss": 2.7954, + "theoretical_loss": 3.565779910808134, + "tokens_seen": 1282286592 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030883650952858577, + "loss": 2.8494, + "theoretical_loss": 3.5657633558712227, + "tokens_seen": 1282352128 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030882647943831495, + "loss": 2.9633, + "theoretical_loss": 3.5657468020172294, + "tokens_seen": 1282417664 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030881644934804413, + "loss": 3.0178, + "theoretical_loss": 3.56573024924603, + "tokens_seen": 1282483200 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003088064192577733, + "loss": 2.6769, + "theoretical_loss": 3.5657136975574963, + "tokens_seen": 1282548736 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030879638916750255, + "loss": 2.7021, + "theoretical_loss": 3.5656971469515035, + "tokens_seen": 1282614272 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003087863590772317, + "loss": 2.645, + "theoretical_loss": 3.565680597427925, + "tokens_seen": 1282679808 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003087763289869609, + "loss": 2.9027, + "theoretical_loss": 3.5656640489866342, + "tokens_seen": 1282745344 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030876629889669004, + "loss": 2.6862, + "theoretical_loss": 3.565647501627506, + "tokens_seen": 1282810880 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2053595, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8137669563293457, + "objective/train/theoretical_loss": 3.5656433649567973, + "objective/train/tokens_used": 1303287264, + "theoretical_loss": 3.5656433649567973, + "tokens_seen": 1282827264 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030875626880641927, + "loss": 2.7832, + "theoretical_loss": 3.5656309553504144, + "tokens_seen": 1282876416 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030874623871614845, + "loss": 2.8722, + "theoretical_loss": 3.5656144101552325, + "tokens_seen": 1282941952 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030873620862587763, + "loss": 2.6227, + "theoretical_loss": 3.565597866041834, + "tokens_seen": 1283007488 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003087261785356068, + "loss": 2.8628, + "theoretical_loss": 3.5655813230100946, + "tokens_seen": 1283073024 + }, + { + "epoch": 3.08, + "learning_rate": 0.000308716148445336, + "loss": 2.9369, + "theoretical_loss": 3.5655647810598867, + "tokens_seen": 1283138560 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003087061183550652, + "loss": 2.8674, + "theoretical_loss": 3.5655482401910854, + "tokens_seen": 1283204096 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003086960882647944, + "loss": 2.8512, + "theoretical_loss": 3.565531700403564, + "tokens_seen": 1283269632 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030868605817452354, + "loss": 2.9047, + "theoretical_loss": 3.5655151616971974, + "tokens_seen": 1283335168 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003086760280842528, + "loss": 2.9447, + "theoretical_loss": 3.565498624071859, + "tokens_seen": 1283400704 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003086659979939819, + "loss": 2.587, + "theoretical_loss": 3.5654820875274233, + "tokens_seen": 1283466240 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030865596790371114, + "loss": 2.8568, + "theoretical_loss": 3.565465552063765, + "tokens_seen": 1283531776 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003086459378134403, + "loss": 2.7699, + "theoretical_loss": 3.565449017680757, + "tokens_seen": 1283597312 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003086359077231695, + "loss": 3.007, + "theoretical_loss": 3.5654324843782743, + "tokens_seen": 1283662848 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003086258776328987, + "loss": 2.9153, + "theoretical_loss": 3.565415952156191, + "tokens_seen": 1283728384 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003086158475426279, + "loss": 2.7305, + "theoretical_loss": 3.565399421014382, + "tokens_seen": 1283793920 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030860581745235704, + "loss": 2.8762, + "theoretical_loss": 3.56538289095272, + "tokens_seen": 1283859456 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003085957873620863, + "loss": 2.8774, + "theoretical_loss": 3.5653663619710807, + "tokens_seen": 1283924992 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003085857572718154, + "loss": 2.9342, + "theoretical_loss": 3.5653498340693384, + "tokens_seen": 1283990528 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030857572718154464, + "loss": 2.7192, + "theoretical_loss": 3.5653333072473665, + "tokens_seen": 1284056064 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003085656970912738, + "loss": 2.7261, + "theoretical_loss": 3.56531678150504, + "tokens_seen": 1284121600 + }, + { + "epoch": 3.08, + "learning_rate": 0.000308555667001003, + "loss": 2.895, + "theoretical_loss": 3.565300256842233, + "tokens_seen": 1284187136 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030854563691073224, + "loss": 2.7976, + "theoretical_loss": 3.56528373325882, + "tokens_seen": 1284252672 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030853560682046137, + "loss": 2.7798, + "theoretical_loss": 3.5652672107546755, + "tokens_seen": 1284318208 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003085255767301906, + "loss": 2.8155, + "theoretical_loss": 3.565250689329674, + "tokens_seen": 1284383744 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003085155466399198, + "loss": 2.982, + "theoretical_loss": 3.5652341689836895, + "tokens_seen": 1284449280 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2056231, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.647995948791504, + "objective/train/theoretical_loss": 3.565230039065775, + "objective/train/tokens_used": 1304925664, + "theoretical_loss": 3.565230039065775, + "tokens_seen": 1284465664 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030850551654964896, + "loss": 2.6968, + "theoretical_loss": 3.5652176497165975, + "tokens_seen": 1284514816 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030849548645937814, + "loss": 2.7403, + "theoretical_loss": 3.565201131528271, + "tokens_seen": 1284580352 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003084854563691074, + "loss": 2.8255, + "theoretical_loss": 3.5651846144185857, + "tokens_seen": 1284645888 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003084754262788365, + "loss": 2.9317, + "theoretical_loss": 3.565168098387416, + "tokens_seen": 1284711424 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030846539618856574, + "loss": 2.681, + "theoretical_loss": 3.565151583434636, + "tokens_seen": 1284776960 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030845536609829487, + "loss": 2.9565, + "theoretical_loss": 3.565135069560121, + "tokens_seen": 1284842496 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003084453360080241, + "loss": 2.7568, + "theoretical_loss": 3.5651185567637445, + "tokens_seen": 1284908032 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003084353059177533, + "loss": 2.8272, + "theoretical_loss": 3.565102045045382, + "tokens_seen": 1284973568 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030842527582748247, + "loss": 2.8857, + "theoretical_loss": 3.5650855344049077, + "tokens_seen": 1285039104 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030841524573721165, + "loss": 2.9589, + "theoretical_loss": 3.565069024842196, + "tokens_seen": 1285104640 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030840521564694083, + "loss": 2.8515, + "theoretical_loss": 3.565052516357123, + "tokens_seen": 1285170176 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030839518555667, + "loss": 2.8295, + "theoretical_loss": 3.565036008949562, + "tokens_seen": 1285235712 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030838515546639924, + "loss": 2.7376, + "theoretical_loss": 3.565019502619389, + "tokens_seen": 1285301248 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030837512537612837, + "loss": 2.6822, + "theoretical_loss": 3.565002997366477, + "tokens_seen": 1285366784 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003083650952858576, + "loss": 2.8662, + "theoretical_loss": 3.5649864931907023, + "tokens_seen": 1285432320 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030835506519558673, + "loss": 2.8145, + "theoretical_loss": 3.5649699900919387, + "tokens_seen": 1285497856 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030834503510531597, + "loss": 2.7228, + "theoretical_loss": 3.564953488070062, + "tokens_seen": 1285563392 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030833500501504515, + "loss": 2.6897, + "theoretical_loss": 3.564936987124946, + "tokens_seen": 1285628928 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030832497492477433, + "loss": 2.8515, + "theoretical_loss": 3.564920487256466, + "tokens_seen": 1285694464 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003083149448345035, + "loss": 2.8343, + "theoretical_loss": 3.564903988464498, + "tokens_seen": 1285760000 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030830491474423275, + "loss": 2.778, + "theoretical_loss": 3.5648874907489145, + "tokens_seen": 1285825536 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003082948846539619, + "loss": 2.7934, + "theoretical_loss": 3.564870994109592, + "tokens_seen": 1285891072 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003082848545636911, + "loss": 2.8446, + "theoretical_loss": 3.5648544985464055, + "tokens_seen": 1285956608 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030827482447342024, + "loss": 2.748, + "theoretical_loss": 3.56483800405923, + "tokens_seen": 1286022144 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030826479438314947, + "loss": 2.7204, + "theoretical_loss": 3.5648215106479393, + "tokens_seen": 1286087680 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2057685, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.59145188331604, + "objective/train/theoretical_loss": 3.5648173874632096, + "objective/train/tokens_used": 1306564064, + "theoretical_loss": 3.5648173874632096, + "tokens_seen": 1286104064 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030825476429287865, + "loss": 2.8994, + "theoretical_loss": 3.56480501831241, + "tokens_seen": 1286153216 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030824473420260783, + "loss": 2.8357, + "theoretical_loss": 3.564788527052516, + "tokens_seen": 1286218752 + }, + { + "epoch": 3.08, + "learning_rate": 0.000308234704112337, + "loss": 2.9047, + "theoretical_loss": 3.5647720368681326, + "tokens_seen": 1286284288 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003082246740220662, + "loss": 2.7496, + "theoretical_loss": 3.564755547759135, + "tokens_seen": 1286349824 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003082146439317954, + "loss": 2.7686, + "theoretical_loss": 3.5647390597253983, + "tokens_seen": 1286415360 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003082046138415246, + "loss": 2.782, + "theoretical_loss": 3.5647225727667973, + "tokens_seen": 1286480896 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030819458375125374, + "loss": 2.8925, + "theoretical_loss": 3.564706086883208, + "tokens_seen": 1286546432 + }, + { + "epoch": 3.08, + "learning_rate": 0.000308184553660983, + "loss": 2.8727, + "theoretical_loss": 3.5646896020745045, + "tokens_seen": 1286611968 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003081745235707121, + "loss": 2.8051, + "theoretical_loss": 3.5646731183405627, + "tokens_seen": 1286677504 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030816449348044134, + "loss": 2.7983, + "theoretical_loss": 3.5646566356812572, + "tokens_seen": 1286743040 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003081544633901705, + "loss": 2.7153, + "theoretical_loss": 3.564640154096464, + "tokens_seen": 1286808576 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003081444332998997, + "loss": 2.8053, + "theoretical_loss": 3.5646236735860573, + "tokens_seen": 1286874112 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003081344032096289, + "loss": 2.8973, + "theoretical_loss": 3.564607194149913, + "tokens_seen": 1286939648 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003081243731193581, + "loss": 2.8325, + "theoretical_loss": 3.564590715787907, + "tokens_seen": 1287005184 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030811434302908724, + "loss": 2.7299, + "theoretical_loss": 3.564574238499913, + "tokens_seen": 1287070720 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003081043129388165, + "loss": 2.675, + "theoretical_loss": 3.5645577622858076, + "tokens_seen": 1287136256 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003080942828485456, + "loss": 2.8618, + "theoretical_loss": 3.564541287145466, + "tokens_seen": 1287201792 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030808425275827484, + "loss": 2.7774, + "theoretical_loss": 3.5645248130787626, + "tokens_seen": 1287267328 + }, + { + "epoch": 3.08, + "learning_rate": 0.000308074222668004, + "loss": 2.8154, + "theoretical_loss": 3.564508340085574, + "tokens_seen": 1287332864 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003080641925777332, + "loss": 2.658, + "theoretical_loss": 3.5644918681657747, + "tokens_seen": 1287398400 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003080541624874624, + "loss": 2.7455, + "theoretical_loss": 3.5644753973192405, + "tokens_seen": 1287463936 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030804413239719157, + "loss": 2.9139, + "theoretical_loss": 3.5644589275458474, + "tokens_seen": 1287529472 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030803410230692075, + "loss": 2.7482, + "theoretical_loss": 3.5644424588454697, + "tokens_seen": 1287595008 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030802407221665, + "loss": 2.8401, + "theoretical_loss": 3.564425991217984, + "tokens_seen": 1287660544 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003080140421263791, + "loss": 2.7801, + "theoretical_loss": 3.5644095246632648, + "tokens_seen": 1287726080 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2060564, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9187965393066406, + "objective/train/theoretical_loss": 3.56440540819219, + "objective/train/tokens_used": 1308202464, + "theoretical_loss": 3.56440540819219, + "tokens_seen": 1287742464 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030800401203610834, + "loss": 2.8576, + "theoretical_loss": 3.5643930591811883, + "tokens_seen": 1287791616 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003079939819458375, + "loss": 2.8187, + "theoretical_loss": 3.56437659477163, + "tokens_seen": 1287857152 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003079839518555667, + "loss": 2.864, + "theoretical_loss": 3.564360131434465, + "tokens_seen": 1287922688 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003079739217652959, + "loss": 2.745, + "theoretical_loss": 3.5643436691695696, + "tokens_seen": 1287988224 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030796389167502507, + "loss": 2.7063, + "theoretical_loss": 3.5643272079768185, + "tokens_seen": 1288053760 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030795386158475425, + "loss": 2.9311, + "theoretical_loss": 3.5643107478560885, + "tokens_seen": 1288119296 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003079438314944835, + "loss": 2.9115, + "theoretical_loss": 3.564294288807254, + "tokens_seen": 1288184832 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003079338014042126, + "loss": 2.7866, + "theoretical_loss": 3.564277830830192, + "tokens_seen": 1288250368 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030792377131394185, + "loss": 3.032, + "theoretical_loss": 3.5642613739247766, + "tokens_seen": 1288315904 + }, + { + "epoch": 3.08, + "learning_rate": 0.000307913741223671, + "loss": 2.7858, + "theoretical_loss": 3.5642449180908846, + "tokens_seen": 1288381440 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003079037111334002, + "loss": 2.7884, + "theoretical_loss": 3.564228463328392, + "tokens_seen": 1288446976 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003078936810431294, + "loss": 2.8669, + "theoretical_loss": 3.5642120096371737, + "tokens_seen": 1288512512 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030788365095285857, + "loss": 2.6556, + "theoretical_loss": 3.564195557017106, + "tokens_seen": 1288578048 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030787362086258775, + "loss": 2.8063, + "theoretical_loss": 3.5641791054680643, + "tokens_seen": 1288643584 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030786359077231693, + "loss": 2.8815, + "theoretical_loss": 3.5641626549899246, + "tokens_seen": 1288709120 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003078535606820461, + "loss": 2.7157, + "theoretical_loss": 3.564146205582563, + "tokens_seen": 1288774656 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030784353059177535, + "loss": 2.8328, + "theoretical_loss": 3.564129757245855, + "tokens_seen": 1288840192 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003078335005015045, + "loss": 2.8642, + "theoretical_loss": 3.5641133099796765, + "tokens_seen": 1288905728 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003078234704112337, + "loss": 2.7168, + "theoretical_loss": 3.5640968637839037, + "tokens_seen": 1288971264 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003078134403209629, + "loss": 2.8251, + "theoretical_loss": 3.5640804186584125, + "tokens_seen": 1289036800 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003078034102306921, + "loss": 2.6499, + "theoretical_loss": 3.564063974603078, + "tokens_seen": 1289102336 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003077933801404213, + "loss": 2.7232, + "theoretical_loss": 3.5640475316177778, + "tokens_seen": 1289167872 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030778335005015044, + "loss": 3.0154, + "theoretical_loss": 3.564031089702386, + "tokens_seen": 1289233408 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030777331995987967, + "loss": 2.8402, + "theoretical_loss": 3.56401464885678, + "tokens_seen": 1289298944 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030776328986960885, + "loss": 2.8579, + "theoretical_loss": 3.563998209080835, + "tokens_seen": 1289364480 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2063230, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8298683166503906, + "objective/train/theoretical_loss": 3.563994099303969, + "objective/train/tokens_used": 1309840864, + "theoretical_loss": 3.563994099303969, + "tokens_seen": 1289380864 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030775325977933803, + "loss": 2.8249, + "theoretical_loss": 3.563981770374428, + "tokens_seen": 1289430016 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003077432296890672, + "loss": 2.9201, + "theoretical_loss": 3.5639653327374337, + "tokens_seen": 1289495552 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003077331995987964, + "loss": 2.8597, + "theoretical_loss": 3.5639488961697294, + "tokens_seen": 1289561088 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003077231695085256, + "loss": 2.8466, + "theoretical_loss": 3.5639324606711904, + "tokens_seen": 1289626624 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003077131394182548, + "loss": 2.7863, + "theoretical_loss": 3.5639160262416936, + "tokens_seen": 1289692160 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030770310932798394, + "loss": 2.7577, + "theoretical_loss": 3.563899592881114, + "tokens_seen": 1289757696 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003076930792377132, + "loss": 2.7678, + "theoretical_loss": 3.563883160589329, + "tokens_seen": 1289823232 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003076830491474423, + "loss": 2.8512, + "theoretical_loss": 3.5638667293662145, + "tokens_seen": 1289888768 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030767301905717154, + "loss": 2.8692, + "theoretical_loss": 3.563850299211646, + "tokens_seen": 1289954304 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003076629889669007, + "loss": 2.7794, + "theoretical_loss": 3.5638338701255003, + "tokens_seen": 1290019840 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003076529588766299, + "loss": 2.8115, + "theoretical_loss": 3.563817442107654, + "tokens_seen": 1290085376 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003076429287863591, + "loss": 2.9429, + "theoretical_loss": 3.563801015157982, + "tokens_seen": 1290150912 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003076328986960883, + "loss": 2.7874, + "theoretical_loss": 3.563784589276362, + "tokens_seen": 1290216448 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030762286860581744, + "loss": 2.9317, + "theoretical_loss": 3.5637681644626698, + "tokens_seen": 1290281984 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003076128385155467, + "loss": 2.8384, + "theoretical_loss": 3.563751740716782, + "tokens_seen": 1290347520 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003076028084252758, + "loss": 2.814, + "theoretical_loss": 3.5637353180385745, + "tokens_seen": 1290413056 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030759277833500504, + "loss": 2.8857, + "theoretical_loss": 3.5637188964279236, + "tokens_seen": 1290478592 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003075827482447342, + "loss": 2.7727, + "theoretical_loss": 3.563702475884706, + "tokens_seen": 1290544128 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003075727181544634, + "loss": 2.7112, + "theoretical_loss": 3.563686056408798, + "tokens_seen": 1290609664 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003075626880641926, + "loss": 2.8399, + "theoretical_loss": 3.5636696380000767, + "tokens_seen": 1290675200 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030755265797392177, + "loss": 2.7639, + "theoretical_loss": 3.5636532206584173, + "tokens_seen": 1290740736 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030754262788365095, + "loss": 2.7424, + "theoretical_loss": 3.5636368043836972, + "tokens_seen": 1290806272 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003075325977933802, + "loss": 2.7651, + "theoretical_loss": 3.5636203891757927, + "tokens_seen": 1290871808 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003075225677031093, + "loss": 2.8601, + "theoretical_loss": 3.5636039750345807, + "tokens_seen": 1290937344 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030751253761283854, + "loss": 2.7055, + "theoretical_loss": 3.5635875619599364, + "tokens_seen": 1291002880 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2066144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.665174961090088, + "objective/train/theoretical_loss": 3.5635834588579125, + "objective/train/tokens_used": 1311479264, + "theoretical_loss": 3.5635834588579125, + "tokens_seen": 1291019264 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003075025075225677, + "loss": 2.8417, + "theoretical_loss": 3.5635711499517377, + "tokens_seen": 1291068416 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003074924774322969, + "loss": 2.758, + "theoretical_loss": 3.5635547390098608, + "tokens_seen": 1291133952 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003074824473420261, + "loss": 2.7966, + "theoretical_loss": 3.5635383291341824, + "tokens_seen": 1291199488 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030747241725175527, + "loss": 2.7976, + "theoretical_loss": 3.5635219203245785, + "tokens_seen": 1291265024 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030746238716148445, + "loss": 2.8557, + "theoretical_loss": 3.5635055125809263, + "tokens_seen": 1291330560 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003074523570712137, + "loss": 2.9736, + "theoretical_loss": 3.5634891059031024, + "tokens_seen": 1291396096 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003074423269809428, + "loss": 2.8324, + "theoretical_loss": 3.5634727002909834, + "tokens_seen": 1291461632 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030743229689067205, + "loss": 2.6813, + "theoretical_loss": 3.5634562957444462, + "tokens_seen": 1291527168 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003074222668004012, + "loss": 2.7805, + "theoretical_loss": 3.563439892263367, + "tokens_seen": 1291592704 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003074122367101304, + "loss": 2.8158, + "theoretical_loss": 3.5634234898476236, + "tokens_seen": 1291658240 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003074022066198596, + "loss": 2.8441, + "theoretical_loss": 3.5634070884970916, + "tokens_seen": 1291723776 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030739217652958877, + "loss": 2.8269, + "theoretical_loss": 3.563390688211648, + "tokens_seen": 1291789312 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030738214643931795, + "loss": 2.7833, + "theoretical_loss": 3.56337428899117, + "tokens_seen": 1291854848 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030737211634904713, + "loss": 2.8598, + "theoretical_loss": 3.563357890835535, + "tokens_seen": 1291920384 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003073620862587763, + "loss": 2.8788, + "theoretical_loss": 3.563341493744618, + "tokens_seen": 1291985920 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030735205616850555, + "loss": 2.872, + "theoretical_loss": 3.5633250977182978, + "tokens_seen": 1292051456 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003073420260782347, + "loss": 2.8946, + "theoretical_loss": 3.5633087027564505, + "tokens_seen": 1292116992 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003073319959879639, + "loss": 2.8883, + "theoretical_loss": 3.5632923088589523, + "tokens_seen": 1292182528 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003073219658976931, + "loss": 2.8475, + "theoretical_loss": 3.563275916025681, + "tokens_seen": 1292248064 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003073119358074223, + "loss": 2.7955, + "theoretical_loss": 3.5632595242565137, + "tokens_seen": 1292313600 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030730190571715146, + "loss": 2.8454, + "theoretical_loss": 3.563243133551327, + "tokens_seen": 1292379136 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030729187562688064, + "loss": 2.8474, + "theoretical_loss": 3.563226743909998, + "tokens_seen": 1292444672 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003072818455366098, + "loss": 2.8789, + "theoretical_loss": 3.5632103553324033, + "tokens_seen": 1292510208 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030727181544633905, + "loss": 2.8362, + "theoretical_loss": 3.56319396781842, + "tokens_seen": 1292575744 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003072617853560682, + "loss": 2.8707, + "theoretical_loss": 3.563177581367926, + "tokens_seen": 1292641280 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2068589, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.921787738800049, + "objective/train/theoretical_loss": 3.5631734849214585, + "objective/train/tokens_used": 1313117664, + "theoretical_loss": 3.5631734849214585, + "tokens_seen": 1292657664 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003072517552657974, + "loss": 2.8481, + "theoretical_loss": 3.563161195980798, + "tokens_seen": 1292706816 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030724172517552654, + "loss": 2.9674, + "theoretical_loss": 3.5631448116569118, + "tokens_seen": 1292772352 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003072316950852558, + "loss": 2.9572, + "theoretical_loss": 3.563128428396146, + "tokens_seen": 1292837888 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030722166499498496, + "loss": 2.8161, + "theoretical_loss": 3.563112046198378, + "tokens_seen": 1292903424 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030721163490471414, + "loss": 2.9145, + "theoretical_loss": 3.5630956650634835, + "tokens_seen": 1292968960 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003072016048144433, + "loss": 2.7793, + "theoretical_loss": 3.563079284991341, + "tokens_seen": 1293034496 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003071915747241725, + "loss": 2.672, + "theoretical_loss": 3.563062905981827, + "tokens_seen": 1293100032 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003071815446339017, + "loss": 2.801, + "theoretical_loss": 3.5630465280348185, + "tokens_seen": 1293165568 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003071715145436309, + "loss": 2.615, + "theoretical_loss": 3.563030151150193, + "tokens_seen": 1293231104 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030716148445336005, + "loss": 3.0172, + "theoretical_loss": 3.5630137753278284, + "tokens_seen": 1293296640 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003071514543630893, + "loss": 2.8771, + "theoretical_loss": 3.5629974005676006, + "tokens_seen": 1293362176 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030714142427281846, + "loss": 2.8382, + "theoretical_loss": 3.5629810268693882, + "tokens_seen": 1293427712 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030713139418254764, + "loss": 2.8414, + "theoretical_loss": 3.562964654233068, + "tokens_seen": 1293493248 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003071213640922768, + "loss": 2.7694, + "theoretical_loss": 3.5629482826585175, + "tokens_seen": 1293558784 + }, + { + "epoch": 3.08, + "learning_rate": 0.000307111334002006, + "loss": 2.8717, + "theoretical_loss": 3.562931912145614, + "tokens_seen": 1293624320 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003071013039117352, + "loss": 2.7078, + "theoretical_loss": 3.562915542694234, + "tokens_seen": 1293689856 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003070912738214644, + "loss": 2.7291, + "theoretical_loss": 3.562899174304256, + "tokens_seen": 1293755392 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030708124373119355, + "loss": 2.6744, + "theoretical_loss": 3.5628828069755576, + "tokens_seen": 1293820928 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003070712136409228, + "loss": 2.799, + "theoretical_loss": 3.5628664407080155, + "tokens_seen": 1293886464 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003070611835506519, + "loss": 2.737, + "theoretical_loss": 3.5628500755015073, + "tokens_seen": 1293952000 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030705115346038115, + "loss": 2.7798, + "theoretical_loss": 3.5628337113559105, + "tokens_seen": 1294017536 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003070411233701104, + "loss": 2.9475, + "theoretical_loss": 3.562817348271103, + "tokens_seen": 1294083072 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003070310932798395, + "loss": 2.8485, + "theoretical_loss": 3.562800986246962, + "tokens_seen": 1294148608 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030702106318956874, + "loss": 2.82, + "theoretical_loss": 3.562784625283365, + "tokens_seen": 1294214144 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003070110330992979, + "loss": 2.7271, + "theoretical_loss": 3.562768265380189, + "tokens_seen": 1294279680 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2071519, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1073174476623535, + "objective/train/theoretical_loss": 3.562764175570072, + "objective/train/tokens_used": 1314756064, + "theoretical_loss": 3.562764175570072, + "tokens_seen": 1294296064 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003070010030090271, + "loss": 2.9335, + "theoretical_loss": 3.5627519065373128, + "tokens_seen": 1294345216 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003069909729187563, + "loss": 2.7736, + "theoretical_loss": 3.5627355487546133, + "tokens_seen": 1294410752 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030698094282848547, + "loss": 2.79, + "theoretical_loss": 3.562719192031968, + "tokens_seen": 1294476288 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030697091273821465, + "loss": 2.9053, + "theoretical_loss": 3.5627028363692546, + "tokens_seen": 1294541824 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003069608826479439, + "loss": 2.8642, + "theoretical_loss": 3.5626864817663515, + "tokens_seen": 1294607360 + }, + { + "epoch": 3.08, + "learning_rate": 0.000306950852557673, + "loss": 2.6827, + "theoretical_loss": 3.562670128223135, + "tokens_seen": 1294672896 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030694082246740225, + "loss": 2.8453, + "theoretical_loss": 3.5626537757394843, + "tokens_seen": 1294738432 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003069307923771314, + "loss": 2.8189, + "theoretical_loss": 3.562637424315276, + "tokens_seen": 1294803968 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003069207622868606, + "loss": 2.8279, + "theoretical_loss": 3.562621073950388, + "tokens_seen": 1294869504 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003069107321965898, + "loss": 2.8517, + "theoretical_loss": 3.562604724644699, + "tokens_seen": 1294935040 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030690070210631897, + "loss": 2.8635, + "theoretical_loss": 3.5625883763980855, + "tokens_seen": 1295000576 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030689067201604815, + "loss": 2.8402, + "theoretical_loss": 3.5625720292104264, + "tokens_seen": 1295066112 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030688064192577733, + "loss": 2.6234, + "theoretical_loss": 3.5625556830815985, + "tokens_seen": 1295131648 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003068706118355065, + "loss": 2.8011, + "theoretical_loss": 3.5625393380114803, + "tokens_seen": 1295197184 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030686058174523575, + "loss": 2.7399, + "theoretical_loss": 3.56252299399995, + "tokens_seen": 1295262720 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003068505516549649, + "loss": 2.9758, + "theoretical_loss": 3.5625066510468844, + "tokens_seen": 1295328256 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003068405215646941, + "loss": 2.8831, + "theoretical_loss": 3.5624903091521625, + "tokens_seen": 1295393792 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003068304914744233, + "loss": 2.8055, + "theoretical_loss": 3.562473968315661, + "tokens_seen": 1295459328 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003068204613841525, + "loss": 2.6627, + "theoretical_loss": 3.56245762853726, + "tokens_seen": 1295524864 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030681043129388166, + "loss": 2.8783, + "theoretical_loss": 3.5624412898168347, + "tokens_seen": 1295590400 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030680040120361084, + "loss": 2.8641, + "theoretical_loss": 3.562424952154265, + "tokens_seen": 1295655936 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030679037111334, + "loss": 2.8905, + "theoretical_loss": 3.562408615549428, + "tokens_seen": 1295721472 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030678034102306925, + "loss": 2.8393, + "theoretical_loss": 3.5623922800022028, + "tokens_seen": 1295787008 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003067703109327984, + "loss": 2.7541, + "theoretical_loss": 3.562375945512466, + "tokens_seen": 1295852544 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003067602808425276, + "loss": 2.752, + "theoretical_loss": 3.5623596120800967, + "tokens_seen": 1295918080 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2072917, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1015067100524902, + "objective/train/theoretical_loss": 3.5623555288872035, + "objective/train/tokens_used": 1316394464, + "theoretical_loss": 3.5623555288872035, + "tokens_seen": 1295934464 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030675025075225674, + "loss": 2.9707, + "theoretical_loss": 3.5623432797049723, + "tokens_seen": 1295983616 + }, + { + "epoch": 3.08, + "learning_rate": 0.000306740220661986, + "loss": 2.7515, + "theoretical_loss": 3.562326948386972, + "tokens_seen": 1296049152 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030673019057171516, + "loss": 2.8453, + "theoretical_loss": 3.5623106181259727, + "tokens_seen": 1296114688 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030672016048144434, + "loss": 2.8654, + "theoretical_loss": 3.562294288921853, + "tokens_seen": 1296180224 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003067101303911735, + "loss": 2.919, + "theoretical_loss": 3.5622779607744914, + "tokens_seen": 1296245760 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003067001003009027, + "loss": 2.8958, + "theoretical_loss": 3.562261633683766, + "tokens_seen": 1296311296 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003066900702106319, + "loss": 2.7682, + "theoretical_loss": 3.5622453076495546, + "tokens_seen": 1296376832 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003066800401203611, + "loss": 2.7638, + "theoretical_loss": 3.5622289826717353, + "tokens_seen": 1296442368 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030667001003009025, + "loss": 2.783, + "theoretical_loss": 3.5622126587501874, + "tokens_seen": 1296507904 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003066599799398195, + "loss": 3.0032, + "theoretical_loss": 3.5621963358847877, + "tokens_seen": 1296573440 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030664994984954866, + "loss": 2.776, + "theoretical_loss": 3.562180014075416, + "tokens_seen": 1296638976 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030663991975927784, + "loss": 2.9684, + "theoretical_loss": 3.562163693321949, + "tokens_seen": 1296704512 + }, + { + "epoch": 3.08, + "learning_rate": 0.000306629889669007, + "loss": 2.7708, + "theoretical_loss": 3.562147373624267, + "tokens_seen": 1296770048 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003066198595787362, + "loss": 2.7872, + "theoretical_loss": 3.5621310549822462, + "tokens_seen": 1296835584 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003066098294884654, + "loss": 2.7892, + "theoretical_loss": 3.562114737395767, + "tokens_seen": 1296901120 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003065997993981946, + "loss": 2.7053, + "theoretical_loss": 3.5620984208647064, + "tokens_seen": 1296966656 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030658976930792375, + "loss": 2.8024, + "theoretical_loss": 3.562082105388943, + "tokens_seen": 1297032192 + }, + { + "epoch": 3.08, + "learning_rate": 0.000306579739217653, + "loss": 2.9303, + "theoretical_loss": 3.5620657909683553, + "tokens_seen": 1297097728 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003065697091273821, + "loss": 2.8179, + "theoretical_loss": 3.562049477602822, + "tokens_seen": 1297163264 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030655967903711135, + "loss": 2.7255, + "theoretical_loss": 3.562033165292222, + "tokens_seen": 1297228800 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030654964894684053, + "loss": 2.8769, + "theoretical_loss": 3.562016854036433, + "tokens_seen": 1297294336 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003065396188565697, + "loss": 2.7975, + "theoretical_loss": 3.5620005438353335, + "tokens_seen": 1297359872 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003065295887662989, + "loss": 2.796, + "theoretical_loss": 3.561984234688803, + "tokens_seen": 1297425408 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003065195586760281, + "loss": 2.9704, + "theoretical_loss": 3.5619679265967186, + "tokens_seen": 1297490944 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030650952858575725, + "loss": 2.8285, + "theoretical_loss": 3.5619516195589602, + "tokens_seen": 1297556480 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2075639, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8028523921966553, + "objective/train/theoretical_loss": 3.561947542964244, + "objective/train/tokens_used": 1318032864, + "theoretical_loss": 3.561947542964244, + "tokens_seen": 1297572864 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003064994984954865, + "loss": 2.7314, + "theoretical_loss": 3.5619353135754053, + "tokens_seen": 1297622016 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003064894684052156, + "loss": 2.9098, + "theoretical_loss": 3.5619190086459334, + "tokens_seen": 1297687552 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030647943831494485, + "loss": 2.8228, + "theoretical_loss": 3.5619027047704224, + "tokens_seen": 1297753088 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030646940822467403, + "loss": 2.8124, + "theoretical_loss": 3.561886401948752, + "tokens_seen": 1297818624 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003064593781344032, + "loss": 2.7555, + "theoretical_loss": 3.5618701001807995, + "tokens_seen": 1297884160 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003064493480441324, + "loss": 3.0056, + "theoretical_loss": 3.5618537994664443, + "tokens_seen": 1297949696 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003064393179538616, + "loss": 2.8435, + "theoretical_loss": 3.5618374998055655, + "tokens_seen": 1298015232 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030642928786359076, + "loss": 2.9488, + "theoretical_loss": 3.561821201198041, + "tokens_seen": 1298080768 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030641925777332, + "loss": 2.8276, + "theoretical_loss": 3.5618049036437505, + "tokens_seen": 1298146304 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003064092276830491, + "loss": 2.6892, + "theoretical_loss": 3.5617886071425717, + "tokens_seen": 1298211840 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030639919759277835, + "loss": 2.7033, + "theoretical_loss": 3.5617723116943845, + "tokens_seen": 1298277376 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003063891675025075, + "loss": 2.9504, + "theoretical_loss": 3.561756017299066, + "tokens_seen": 1298342912 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003063791374122367, + "loss": 2.799, + "theoretical_loss": 3.5617397239564976, + "tokens_seen": 1298408448 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003063691073219659, + "loss": 2.7138, + "theoretical_loss": 3.561723431666556, + "tokens_seen": 1298473984 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003063590772316951, + "loss": 2.9987, + "theoretical_loss": 3.561707140429121, + "tokens_seen": 1298539520 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030634904714142426, + "loss": 2.8682, + "theoretical_loss": 3.561690850244071, + "tokens_seen": 1298605056 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003063390170511535, + "loss": 2.946, + "theoretical_loss": 3.561674561111286, + "tokens_seen": 1298670592 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003063289869608826, + "loss": 2.7838, + "theoretical_loss": 3.5616582730306434, + "tokens_seen": 1298736128 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030631895687061186, + "loss": 2.9839, + "theoretical_loss": 3.561641986002023, + "tokens_seen": 1298801664 + }, + { + "epoch": 3.08, + "learning_rate": 0.000306308926780341, + "loss": 2.7448, + "theoretical_loss": 3.5616257000253038, + "tokens_seen": 1298867200 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003062988966900702, + "loss": 2.8656, + "theoretical_loss": 3.5616094151003646, + "tokens_seen": 1298932736 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030628886659979945, + "loss": 2.8582, + "theoretical_loss": 3.5615931312270845, + "tokens_seen": 1298998272 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003062788365095286, + "loss": 2.8221, + "theoretical_loss": 3.561576848405343, + "tokens_seen": 1299063808 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003062688064192578, + "loss": 2.8143, + "theoretical_loss": 3.561560566635018, + "tokens_seen": 1299129344 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030625877632898694, + "loss": 2.8654, + "theoretical_loss": 3.5615442859159896, + "tokens_seen": 1299194880 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2078608, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.149231195449829, + "objective/train/theoretical_loss": 3.561540215900483, + "objective/train/tokens_used": 1319671264, + "theoretical_loss": 3.561540215900483, + "tokens_seen": 1299211264 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003062487462387162, + "loss": 2.8877, + "theoretical_loss": 3.5615280062481363, + "tokens_seen": 1299260416 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030623871614844536, + "loss": 2.7977, + "theoretical_loss": 3.5615117276313377, + "tokens_seen": 1299325952 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030622868605817454, + "loss": 2.6923, + "theoretical_loss": 3.5614954500654727, + "tokens_seen": 1299391488 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003062186559679037, + "loss": 2.7712, + "theoretical_loss": 3.56147917355042, + "tokens_seen": 1299457024 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003062086258776329, + "loss": 2.7567, + "theoretical_loss": 3.5614628980860594, + "tokens_seen": 1299522560 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003061985957873621, + "loss": 2.7368, + "theoretical_loss": 3.5614466236722704, + "tokens_seen": 1299588096 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003061885656970913, + "loss": 2.8776, + "theoretical_loss": 3.561430350308931, + "tokens_seen": 1299653632 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030617853560682045, + "loss": 2.8353, + "theoretical_loss": 3.561414077995922, + "tokens_seen": 1299719168 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003061685055165497, + "loss": 2.6665, + "theoretical_loss": 3.5613978067331207, + "tokens_seen": 1299784704 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030615847542627886, + "loss": 2.79, + "theoretical_loss": 3.561381536520408, + "tokens_seen": 1299850240 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030614844533600804, + "loss": 2.8438, + "theoretical_loss": 3.5613652673576626, + "tokens_seen": 1299915776 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030614844533600804, + "loss": 2.9116, + "theoretical_loss": 3.561348999244764, + "tokens_seen": 1299981312 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003061384152457372, + "loss": 2.728, + "theoretical_loss": 3.561332732181591, + "tokens_seen": 1300046848 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003061283851554664, + "loss": 2.7514, + "theoretical_loss": 3.561316466168024, + "tokens_seen": 1300112384 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003061183550651956, + "loss": 2.754, + "theoretical_loss": 3.5613002012039416, + "tokens_seen": 1300177920 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003061083249749248, + "loss": 2.8626, + "theoretical_loss": 3.561283937289223, + "tokens_seen": 1300243456 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030609829488465395, + "loss": 2.7788, + "theoretical_loss": 3.561267674423748, + "tokens_seen": 1300308992 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003060882647943832, + "loss": 2.7701, + "theoretical_loss": 3.561251412607396, + "tokens_seen": 1300374528 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003060782347041123, + "loss": 2.8661, + "theoretical_loss": 3.5612351518400467, + "tokens_seen": 1300440064 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030606820461384155, + "loss": 2.8729, + "theoretical_loss": 3.5612188921215786, + "tokens_seen": 1300505600 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030605817452357073, + "loss": 2.732, + "theoretical_loss": 3.561202633451873, + "tokens_seen": 1300571136 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003060481444332999, + "loss": 2.8145, + "theoretical_loss": 3.5611863758308075, + "tokens_seen": 1300636672 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003060381143430291, + "loss": 2.6258, + "theoretical_loss": 3.5611701192582625, + "tokens_seen": 1300702208 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003060280842527583, + "loss": 2.7564, + "theoretical_loss": 3.5611538637341176, + "tokens_seen": 1300767744 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030601805416248745, + "loss": 2.8881, + "theoretical_loss": 3.5611376092582523, + "tokens_seen": 1300833280 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2081298, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7671711444854736, + "objective/train/theoretical_loss": 3.5611335458030653, + "objective/train/tokens_used": 1321309664, + "theoretical_loss": 3.5611335458030653, + "tokens_seen": 1300849664 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003060080240722167, + "loss": 2.957, + "theoretical_loss": 3.5611213558305463, + "tokens_seen": 1300898816 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003059979939819458, + "loss": 2.8247, + "theoretical_loss": 3.5611051034508785, + "tokens_seen": 1300964352 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030598796389167505, + "loss": 2.7508, + "theoretical_loss": 3.5610888521191297, + "tokens_seen": 1301029888 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030597793380140423, + "loss": 2.9337, + "theoretical_loss": 3.5610726018351784, + "tokens_seen": 1301095424 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003059679037111334, + "loss": 2.8224, + "theoretical_loss": 3.5610563525989054, + "tokens_seen": 1301160960 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003059578736208626, + "loss": 2.7075, + "theoretical_loss": 3.561040104410189, + "tokens_seen": 1301226496 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003059478435305918, + "loss": 2.8753, + "theoretical_loss": 3.5610238572689106, + "tokens_seen": 1301292032 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030593781344032096, + "loss": 2.9146, + "theoretical_loss": 3.561007611174948, + "tokens_seen": 1301357568 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003059277833500502, + "loss": 2.9089, + "theoretical_loss": 3.560991366128183, + "tokens_seen": 1301423104 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003059177532597793, + "loss": 2.6995, + "theoretical_loss": 3.560975122128494, + "tokens_seen": 1301488640 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030590772316950855, + "loss": 2.8176, + "theoretical_loss": 3.560958879175761, + "tokens_seen": 1301554176 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003058976930792377, + "loss": 2.8869, + "theoretical_loss": 3.5609426372698643, + "tokens_seen": 1301619712 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003058876629889669, + "loss": 2.8934, + "theoretical_loss": 3.560926396410683, + "tokens_seen": 1301685248 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003058776328986961, + "loss": 2.8092, + "theoretical_loss": 3.560910156598098, + "tokens_seen": 1301750784 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003058676028084253, + "loss": 2.9269, + "theoretical_loss": 3.5608939178319883, + "tokens_seen": 1301816320 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030585757271815446, + "loss": 2.9519, + "theoretical_loss": 3.5608776801122333, + "tokens_seen": 1301881856 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003058475426278837, + "loss": 2.9367, + "theoretical_loss": 3.5608614434387142, + "tokens_seen": 1301947392 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003058375125376128, + "loss": 2.8458, + "theoretical_loss": 3.5608452078113104, + "tokens_seen": 1302012928 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030582748244734206, + "loss": 2.9139, + "theoretical_loss": 3.5608289732299014, + "tokens_seen": 1302078464 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003058174523570712, + "loss": 2.8548, + "theoretical_loss": 3.560812739694368, + "tokens_seen": 1302144000 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003058074222668004, + "loss": 2.9906, + "theoretical_loss": 3.5607965072045893, + "tokens_seen": 1302209536 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003057973921765296, + "loss": 2.9825, + "theoretical_loss": 3.5607802757604463, + "tokens_seen": 1302275072 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003057873620862588, + "loss": 2.8054, + "theoretical_loss": 3.560764045361818, + "tokens_seen": 1302340608 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030577733199598796, + "loss": 2.776, + "theoretical_loss": 3.5607478160085853, + "tokens_seen": 1302406144 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030576730190571714, + "loss": 2.8602, + "theoretical_loss": 3.560731587700628, + "tokens_seen": 1302471680 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2083714, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0890378952026367, + "objective/train/theoretical_loss": 3.5607275307869486, + "objective/train/tokens_used": 1322948064, + "theoretical_loss": 3.5607275307869486, + "tokens_seen": 1302488064 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003057572718154463, + "loss": 2.7941, + "theoretical_loss": 3.5607153604378254, + "tokens_seen": 1302537216 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030574724172517556, + "loss": 2.6113, + "theoretical_loss": 3.560699134220059, + "tokens_seen": 1302602752 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003057372116349047, + "loss": 2.6274, + "theoretical_loss": 3.560682909047208, + "tokens_seen": 1302668288 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003057271815446339, + "loss": 2.8497, + "theoretical_loss": 3.560666684919153, + "tokens_seen": 1302733824 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030571715145436305, + "loss": 2.8435, + "theoretical_loss": 3.560650461835774, + "tokens_seen": 1302799360 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003057071213640923, + "loss": 2.8352, + "theoretical_loss": 3.5606342397969506, + "tokens_seen": 1302864896 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030569709127382147, + "loss": 2.7981, + "theoretical_loss": 3.560618018802564, + "tokens_seen": 1302930432 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030568706118355065, + "loss": 2.9801, + "theoretical_loss": 3.5606017988524936, + "tokens_seen": 1302995968 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030567703109327983, + "loss": 2.689, + "theoretical_loss": 3.56058557994662, + "tokens_seen": 1303061504 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030566700100300906, + "loss": 2.7868, + "theoretical_loss": 3.5605693620848236, + "tokens_seen": 1303127040 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003056569709127382, + "loss": 2.8731, + "theoretical_loss": 3.5605531452669847, + "tokens_seen": 1303192576 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003056469408224674, + "loss": 2.7249, + "theoretical_loss": 3.560536929492983, + "tokens_seen": 1303258112 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030563691073219655, + "loss": 2.8185, + "theoretical_loss": 3.5605207147627, + "tokens_seen": 1303323648 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003056268806419258, + "loss": 2.9418, + "theoretical_loss": 3.5605045010760152, + "tokens_seen": 1303389184 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030561685055165497, + "loss": 2.7724, + "theoretical_loss": 3.5604882884328086, + "tokens_seen": 1303454720 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030560682046138415, + "loss": 2.7919, + "theoretical_loss": 3.560472076832961, + "tokens_seen": 1303520256 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030559679037111333, + "loss": 2.7925, + "theoretical_loss": 3.560455866276354, + "tokens_seen": 1303585792 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003055867602808425, + "loss": 2.8229, + "theoretical_loss": 3.560439656762866, + "tokens_seen": 1303651328 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003055767301905717, + "loss": 2.6363, + "theoretical_loss": 3.5604234482923784, + "tokens_seen": 1303716864 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030556670010030093, + "loss": 2.8463, + "theoretical_loss": 3.560407240864772, + "tokens_seen": 1303782400 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030555667001003006, + "loss": 2.9722, + "theoretical_loss": 3.5603910344799266, + "tokens_seen": 1303847936 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003055466399197593, + "loss": 2.7899, + "theoretical_loss": 3.560374829137723, + "tokens_seen": 1303913472 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003055366098294885, + "loss": 2.8613, + "theoretical_loss": 3.5603586248380417, + "tokens_seen": 1303979008 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030552657973921765, + "loss": 2.8385, + "theoretical_loss": 3.5603424215807635, + "tokens_seen": 1304044544 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003055165496489469, + "loss": 2.8349, + "theoretical_loss": 3.5603262193657685, + "tokens_seen": 1304110080 + }, + { + "epoch": 3.08, + "objective/train/docs_used": 2086380, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.472736120223999, + "objective/train/theoretical_loss": 3.5603221689748628, + "objective/train/tokens_used": 1324586464, + "theoretical_loss": 3.5603221689748628, + "tokens_seen": 1304126464 + }, + { + "epoch": 3.08, + "learning_rate": 0.000305506519558676, + "loss": 2.6381, + "theoretical_loss": 3.5603100181929372, + "tokens_seen": 1304175616 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030549648946840525, + "loss": 2.724, + "theoretical_loss": 3.560293818062151, + "tokens_seen": 1304241152 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030548645937813443, + "loss": 2.8502, + "theoretical_loss": 3.56027761897329, + "tokens_seen": 1304306688 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003054764292878636, + "loss": 2.8353, + "theoretical_loss": 3.560261420926235, + "tokens_seen": 1304372224 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003054663991975928, + "loss": 2.8943, + "theoretical_loss": 3.5602452239208664, + "tokens_seen": 1304437760 + }, + { + "epoch": 3.08, + "learning_rate": 0.000305456369107322, + "loss": 2.903, + "theoretical_loss": 3.5602290279570648, + "tokens_seen": 1304503296 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030544633901705116, + "loss": 2.843, + "theoretical_loss": 3.5602128330347114, + "tokens_seen": 1304568832 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003054363089267804, + "loss": 2.8759, + "theoretical_loss": 3.560196639153687, + "tokens_seen": 1304634368 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003054262788365095, + "loss": 2.8374, + "theoretical_loss": 3.560180446313871, + "tokens_seen": 1304699904 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030541624874623875, + "loss": 2.8442, + "theoretical_loss": 3.5601642545151453, + "tokens_seen": 1304765440 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003054062186559679, + "loss": 2.8615, + "theoretical_loss": 3.5601480637573912, + "tokens_seen": 1304830976 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003053961885656971, + "loss": 2.798, + "theoretical_loss": 3.560131874040488, + "tokens_seen": 1304896512 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003053861584754263, + "loss": 2.7265, + "theoretical_loss": 3.560115685364318, + "tokens_seen": 1304962048 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003053761283851555, + "loss": 2.7766, + "theoretical_loss": 3.560099497728761, + "tokens_seen": 1305027584 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030536609829488466, + "loss": 2.6567, + "theoretical_loss": 3.5600833111336985, + "tokens_seen": 1305093120 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003053560682046139, + "loss": 2.8244, + "theoretical_loss": 3.560067125579011, + "tokens_seen": 1305158656 + }, + { + "epoch": 3.08, + "learning_rate": 0.000305346038114343, + "loss": 2.8867, + "theoretical_loss": 3.560050941064579, + "tokens_seen": 1305224192 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030533600802407226, + "loss": 2.9066, + "theoretical_loss": 3.5600347575902846, + "tokens_seen": 1305289728 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003053259779338014, + "loss": 2.6602, + "theoretical_loss": 3.5600185751560076, + "tokens_seen": 1305355264 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003053159478435306, + "loss": 2.7313, + "theoretical_loss": 3.5600023937616294, + "tokens_seen": 1305420800 + }, + { + "epoch": 3.08, + "learning_rate": 0.0003053059177532598, + "loss": 2.8676, + "theoretical_loss": 3.559986213407031, + "tokens_seen": 1305486336 + }, + { + "epoch": 3.08, + "learning_rate": 0.000305295887662989, + "loss": 2.7769, + "theoretical_loss": 3.5599700340920934, + "tokens_seen": 1305551872 + }, + { + "epoch": 3.08, + "learning_rate": 0.00030528585757271816, + "loss": 2.889, + "theoretical_loss": 3.5599538558166977, + "tokens_seen": 1305617408 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030527582748244734, + "loss": 2.7681, + "theoretical_loss": 3.5599376785807246, + "tokens_seen": 1305682944 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003052657973921765, + "loss": 2.853, + "theoretical_loss": 3.559921502384056, + "tokens_seen": 1305748480 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2089277, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.853698492050171, + "objective/train/theoretical_loss": 3.5599174584972655, + "objective/train/tokens_used": 1326224864, + "theoretical_loss": 3.5599174584972655, + "tokens_seen": 1305764864 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030525576730190576, + "loss": 2.7721, + "theoretical_loss": 3.5599053272265717, + "tokens_seen": 1305814016 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003052457372116349, + "loss": 2.793, + "theoretical_loss": 3.5598891531081533, + "tokens_seen": 1305879552 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003052357071213641, + "loss": 2.6352, + "theoretical_loss": 3.5598729800286826, + "tokens_seen": 1305945088 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030522567703109325, + "loss": 2.8883, + "theoretical_loss": 3.5598568079880395, + "tokens_seen": 1306010624 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003052156469408225, + "loss": 2.8844, + "theoretical_loss": 3.559840636986106, + "tokens_seen": 1306076160 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030520561685055167, + "loss": 2.8389, + "theoretical_loss": 3.5598244670227635, + "tokens_seen": 1306141696 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030519558676028085, + "loss": 2.7417, + "theoretical_loss": 3.5598082980978925, + "tokens_seen": 1306207232 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030518555667001003, + "loss": 2.9572, + "theoretical_loss": 3.559792130211374, + "tokens_seen": 1306272768 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030517552657973926, + "loss": 2.8712, + "theoretical_loss": 3.5597759633630903, + "tokens_seen": 1306338304 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003051654964894684, + "loss": 2.8525, + "theoretical_loss": 3.559759797552922, + "tokens_seen": 1306403840 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003051554663991976, + "loss": 2.7293, + "theoretical_loss": 3.5597436327807506, + "tokens_seen": 1306469376 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030514543630892675, + "loss": 2.8364, + "theoretical_loss": 3.559727469046457, + "tokens_seen": 1306534912 + }, + { + "epoch": 3.09, + "learning_rate": 0.000305135406218656, + "loss": 2.9072, + "theoretical_loss": 3.5597113063499224, + "tokens_seen": 1306600448 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030512537612838517, + "loss": 2.8667, + "theoretical_loss": 3.559695144691029, + "tokens_seen": 1306665984 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030511534603811435, + "loss": 2.911, + "theoretical_loss": 3.559678984069657, + "tokens_seen": 1306731520 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030510531594784353, + "loss": 2.9653, + "theoretical_loss": 3.559662824485689, + "tokens_seen": 1306797056 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003050952858575727, + "loss": 2.9226, + "theoretical_loss": 3.559646665939005, + "tokens_seen": 1306862592 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003050852557673019, + "loss": 2.9666, + "theoretical_loss": 3.559630508429488, + "tokens_seen": 1306928128 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030507522567703113, + "loss": 2.7287, + "theoretical_loss": 3.559614351957018, + "tokens_seen": 1306993664 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030506519558676026, + "loss": 2.7716, + "theoretical_loss": 3.559598196521477, + "tokens_seen": 1307059200 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003050551654964895, + "loss": 2.7843, + "theoretical_loss": 3.559582042122747, + "tokens_seen": 1307124736 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003050451354062186, + "loss": 2.7594, + "theoretical_loss": 3.5595658887607087, + "tokens_seen": 1307190272 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030503510531594785, + "loss": 2.7773, + "theoretical_loss": 3.559549736435243, + "tokens_seen": 1307255808 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030502507522567703, + "loss": 2.8944, + "theoretical_loss": 3.5595335851462337, + "tokens_seen": 1307321344 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003050150451354062, + "loss": 2.8965, + "theoretical_loss": 3.55951743489356, + "tokens_seen": 1307386880 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2090640, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5629920959472656, + "objective/train/theoretical_loss": 3.5595133974923057, + "objective/train/tokens_used": 1327863264, + "theoretical_loss": 3.5595133974923057, + "tokens_seen": 1307403264 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003050050150451354, + "loss": 2.7947, + "theoretical_loss": 3.5595012856771047, + "tokens_seen": 1307452416 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030499498495486463, + "loss": 2.8572, + "theoretical_loss": 3.5594851374967487, + "tokens_seen": 1307517952 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030498495486459376, + "loss": 2.6986, + "theoretical_loss": 3.5594689903523746, + "tokens_seen": 1307583488 + }, + { + "epoch": 3.09, + "learning_rate": 0.000304974924774323, + "loss": 2.7664, + "theoretical_loss": 3.5594528442438627, + "tokens_seen": 1307649024 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003049648946840521, + "loss": 2.7047, + "theoretical_loss": 3.559436699171096, + "tokens_seen": 1307714560 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030495486459378136, + "loss": 2.7502, + "theoretical_loss": 3.559420555133955, + "tokens_seen": 1307780096 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030494483450351054, + "loss": 2.8966, + "theoretical_loss": 3.559404412132322, + "tokens_seen": 1307845632 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003049348044132397, + "loss": 2.8222, + "theoretical_loss": 3.5593882701660786, + "tokens_seen": 1307911168 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003049247743229689, + "loss": 2.7801, + "theoretical_loss": 3.5593721292351064, + "tokens_seen": 1307976704 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003049147442326981, + "loss": 2.9137, + "theoretical_loss": 3.559355989339287, + "tokens_seen": 1308042240 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030490471414242726, + "loss": 2.9152, + "theoretical_loss": 3.559339850478503, + "tokens_seen": 1308107776 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003048946840521565, + "loss": 2.78, + "theoretical_loss": 3.5593237126526347, + "tokens_seen": 1308173312 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003048846539618856, + "loss": 2.8774, + "theoretical_loss": 3.559307575861565, + "tokens_seen": 1308238848 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030487462387161486, + "loss": 2.7569, + "theoretical_loss": 3.559291440105176, + "tokens_seen": 1308304384 + }, + { + "epoch": 3.09, + "learning_rate": 0.000304864593781344, + "loss": 2.8994, + "theoretical_loss": 3.5592753053833484, + "tokens_seen": 1308369920 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003048545636910732, + "loss": 2.8809, + "theoretical_loss": 3.559259171695965, + "tokens_seen": 1308435456 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003048445336008024, + "loss": 2.737, + "theoretical_loss": 3.559243039042907, + "tokens_seen": 1308500992 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003048345035105316, + "loss": 2.8818, + "theoretical_loss": 3.5592269074240566, + "tokens_seen": 1308566528 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030482447342026076, + "loss": 2.9159, + "theoretical_loss": 3.5592107768392953, + "tokens_seen": 1308632064 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030481444332999, + "loss": 2.7304, + "theoretical_loss": 3.559194647288506, + "tokens_seen": 1308697600 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003048044132397192, + "loss": 2.7646, + "theoretical_loss": 3.55917851877157, + "tokens_seen": 1308763136 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030479438314944836, + "loss": 2.8109, + "theoretical_loss": 3.5591623912883694, + "tokens_seen": 1308828672 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030478435305917754, + "loss": 2.8019, + "theoretical_loss": 3.5591462648387857, + "tokens_seen": 1308894208 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003047743229689067, + "loss": 2.8185, + "theoretical_loss": 3.5591301394227015, + "tokens_seen": 1308959744 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030476429287863596, + "loss": 2.7675, + "theoretical_loss": 3.5591140150399987, + "tokens_seen": 1309025280 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2093346, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9173524379730225, + "objective/train/theoretical_loss": 3.559109984105775, + "objective/train/tokens_used": 1329501664, + "theoretical_loss": 3.559109984105775, + "tokens_seen": 1309041664 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003047542627883651, + "loss": 2.9523, + "theoretical_loss": 3.5590978916905596, + "tokens_seen": 1309090816 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003047442326980943, + "loss": 2.8903, + "theoretical_loss": 3.5590817693742656, + "tokens_seen": 1309156352 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030473420260782345, + "loss": 2.873, + "theoretical_loss": 3.5590656480909995, + "tokens_seen": 1309221888 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003047241725175527, + "loss": 2.8857, + "theoretical_loss": 3.559049527840643, + "tokens_seen": 1309287424 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030471414242728187, + "loss": 2.7579, + "theoretical_loss": 3.559033408623078, + "tokens_seen": 1309352960 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030470411233701105, + "loss": 2.9987, + "theoretical_loss": 3.559017290438187, + "tokens_seen": 1309418496 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030469408224674023, + "loss": 2.866, + "theoretical_loss": 3.559001173285852, + "tokens_seen": 1309484032 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030468405215646946, + "loss": 2.8507, + "theoretical_loss": 3.5589850571659554, + "tokens_seen": 1309549568 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003046740220661986, + "loss": 2.8989, + "theoretical_loss": 3.5589689420783794, + "tokens_seen": 1309615104 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003046639919759278, + "loss": 2.8002, + "theoretical_loss": 3.5589528280230063, + "tokens_seen": 1309680640 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030465396188565695, + "loss": 2.8096, + "theoretical_loss": 3.5589367149997173, + "tokens_seen": 1309746176 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003046439317953862, + "loss": 2.597, + "theoretical_loss": 3.558920603008396, + "tokens_seen": 1309811712 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030463390170511537, + "loss": 2.8211, + "theoretical_loss": 3.5589044920489243, + "tokens_seen": 1309877248 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030462387161484455, + "loss": 2.8166, + "theoretical_loss": 3.558888382121184, + "tokens_seen": 1309942784 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030461384152457373, + "loss": 2.8418, + "theoretical_loss": 3.558872273225058, + "tokens_seen": 1310008320 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003046038114343029, + "loss": 2.9632, + "theoretical_loss": 3.558856165360429, + "tokens_seen": 1310073856 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003045937813440321, + "loss": 2.6233, + "theoretical_loss": 3.5588400585271778, + "tokens_seen": 1310139392 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030458375125376133, + "loss": 2.776, + "theoretical_loss": 3.558823952725188, + "tokens_seen": 1310204928 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030457372116349046, + "loss": 2.9147, + "theoretical_loss": 3.558807847954342, + "tokens_seen": 1310270464 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003045636910732197, + "loss": 3.0628, + "theoretical_loss": 3.558791744214522, + "tokens_seen": 1310336000 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003045536609829488, + "loss": 2.8744, + "theoretical_loss": 3.55877564150561, + "tokens_seen": 1310401536 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030454363089267805, + "loss": 3.0346, + "theoretical_loss": 3.558759539827489, + "tokens_seen": 1310467072 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030453360080240723, + "loss": 2.7699, + "theoretical_loss": 3.5587434391800414, + "tokens_seen": 1310532608 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003045235707121364, + "loss": 2.7961, + "theoretical_loss": 3.5587273395631493, + "tokens_seen": 1310598144 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003045135406218656, + "loss": 2.7241, + "theoretical_loss": 3.558711240976696, + "tokens_seen": 1310663680 + }, + { + "debugging/Self-BLEU-5": 0.6091227495457403, + "debugging/distinct-1-grams": 0.7620457493429281, + "debugging/distinct-2-grams": 0.9469750385948374, + "debugging/entropy-1-grams": 6.479892599711917, + "debugging/entropy-2-grams": 7.601295582494769, + "debugging/length": 520.5333333333333, + "debugging/num_segments": 30, + "epoch": 3.09, + "objective/train/docs_used": 2096158, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.00838041305542, + "objective/train/theoretical_loss": 3.558707216491075, + "objective/train/tokens_used": 1331140064, + "theoretical_loss": 3.558707216491075, + "tokens_seen": 1310680064 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030450351053159483, + "loss": 2.7861, + "theoretical_loss": 3.5586951434205636, + "tokens_seen": 1310729216 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030449348044132396, + "loss": 2.7585, + "theoretical_loss": 3.558679046894634, + "tokens_seen": 1310794752 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003044834503510532, + "loss": 2.8584, + "theoretical_loss": 3.558662951398791, + "tokens_seen": 1310860288 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003044734202607823, + "loss": 2.6333, + "theoretical_loss": 3.558646856932916, + "tokens_seen": 1310925824 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030446339017051156, + "loss": 2.7745, + "theoretical_loss": 3.5586307634968923, + "tokens_seen": 1310991360 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030445336008024074, + "loss": 2.8646, + "theoretical_loss": 3.558614671090602, + "tokens_seen": 1311056896 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003044433299899699, + "loss": 2.6914, + "theoretical_loss": 3.558598579713929, + "tokens_seen": 1311122432 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003044332998996991, + "loss": 2.9208, + "theoretical_loss": 3.5585824893667546, + "tokens_seen": 1311187968 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003044232698094283, + "loss": 2.8271, + "theoretical_loss": 3.5585664000489614, + "tokens_seen": 1311253504 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030441323971915746, + "loss": 2.8154, + "theoretical_loss": 3.5585503117604333, + "tokens_seen": 1311319040 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003044032096288867, + "loss": 2.859, + "theoretical_loss": 3.5585342245010523, + "tokens_seen": 1311384576 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003043931795386158, + "loss": 2.8052, + "theoretical_loss": 3.558518138270701, + "tokens_seen": 1311450112 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030438314944834506, + "loss": 2.8484, + "theoretical_loss": 3.5585020530692626, + "tokens_seen": 1311515648 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003043731193580742, + "loss": 2.733, + "theoretical_loss": 3.55848596889662, + "tokens_seen": 1311581184 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003043630892678034, + "loss": 2.7658, + "theoretical_loss": 3.558469885752655, + "tokens_seen": 1311646720 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003043530591775326, + "loss": 2.9962, + "theoretical_loss": 3.558453803637251, + "tokens_seen": 1311712256 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003043430290872618, + "loss": 2.7442, + "theoretical_loss": 3.5584377225502912, + "tokens_seen": 1311777792 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030433299899699097, + "loss": 2.7688, + "theoretical_loss": 3.5584216424916586, + "tokens_seen": 1311843328 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003043229689067202, + "loss": 2.8606, + "theoretical_loss": 3.5584055634612355, + "tokens_seen": 1311908864 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030431293881644933, + "loss": 2.7325, + "theoretical_loss": 3.558389485458904, + "tokens_seen": 1311974400 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030430290872617856, + "loss": 2.9901, + "theoretical_loss": 3.558373408484549, + "tokens_seen": 1312039936 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003042928786359077, + "loss": 2.822, + "theoretical_loss": 3.558357332538052, + "tokens_seen": 1312105472 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003042828485456369, + "loss": 2.7404, + "theoretical_loss": 3.5583412576192965, + "tokens_seen": 1312171008 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003042728184553661, + "loss": 2.5848, + "theoretical_loss": 3.558325183728165, + "tokens_seen": 1312236544 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003042627883650953, + "loss": 2.8667, + "theoretical_loss": 3.558309110864541, + "tokens_seen": 1312302080 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2098971, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5783674716949463, + "objective/train/theoretical_loss": 3.558305092809169, + "objective/train/tokens_used": 1332778464, + "theoretical_loss": 3.558305092809169, + "tokens_seen": 1312318464 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030425275827482447, + "loss": 2.7751, + "theoretical_loss": 3.558293039028307, + "tokens_seen": 1312367616 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030424272818455365, + "loss": 2.7733, + "theoretical_loss": 3.5582769682193467, + "tokens_seen": 1312433152 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030423269809428283, + "loss": 2.9529, + "theoretical_loss": 3.558260898437543, + "tokens_seen": 1312498688 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030422266800401207, + "loss": 2.8039, + "theoretical_loss": 3.5582448296827787, + "tokens_seen": 1312564224 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003042126379137412, + "loss": 2.9031, + "theoretical_loss": 3.558228761954936, + "tokens_seen": 1312629760 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030420260782347043, + "loss": 2.8049, + "theoretical_loss": 3.5582126952539, + "tokens_seen": 1312695296 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003041925777331996, + "loss": 2.8804, + "theoretical_loss": 3.558196629579552, + "tokens_seen": 1312760832 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003041825476429288, + "loss": 2.8954, + "theoretical_loss": 3.558180564931776, + "tokens_seen": 1312826368 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030417251755265797, + "loss": 2.7744, + "theoretical_loss": 3.558164501310456, + "tokens_seen": 1312891904 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030416248746238715, + "loss": 2.9904, + "theoretical_loss": 3.558148438715474, + "tokens_seen": 1312957440 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030415245737211633, + "loss": 2.9539, + "theoretical_loss": 3.5581323771467126, + "tokens_seen": 1313022976 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030414242728184557, + "loss": 2.7995, + "theoretical_loss": 3.558116316604056, + "tokens_seen": 1313088512 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003041323971915747, + "loss": 2.8015, + "theoretical_loss": 3.558100257087388, + "tokens_seen": 1313154048 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030412236710130393, + "loss": 2.7951, + "theoretical_loss": 3.5580841985965908, + "tokens_seen": 1313219584 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030411233701103306, + "loss": 2.7978, + "theoretical_loss": 3.558068141131548, + "tokens_seen": 1313285120 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003041023069207623, + "loss": 2.832, + "theoretical_loss": 3.5580520846921426, + "tokens_seen": 1313350656 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003040922768304915, + "loss": 2.7334, + "theoretical_loss": 3.5580360292782585, + "tokens_seen": 1313416192 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030408224674022066, + "loss": 2.9385, + "theoretical_loss": 3.558019974889779, + "tokens_seen": 1313481728 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030407221664994984, + "loss": 2.8476, + "theoretical_loss": 3.558003921526587, + "tokens_seen": 1313547264 + }, + { + "epoch": 3.09, + "learning_rate": 0.000304062186559679, + "loss": 2.8456, + "theoretical_loss": 3.5579878691885662, + "tokens_seen": 1313612800 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030405215646940825, + "loss": 2.6621, + "theoretical_loss": 3.5579718178756004, + "tokens_seen": 1313678336 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030404212637913743, + "loss": 2.7341, + "theoretical_loss": 3.557955767587572, + "tokens_seen": 1313743872 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003040320962888666, + "loss": 2.9202, + "theoretical_loss": 3.5579397183243646, + "tokens_seen": 1313809408 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003040220661985958, + "loss": 2.8886, + "theoretical_loss": 3.5579236700858625, + "tokens_seen": 1313874944 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030401203610832503, + "loss": 2.5524, + "theoretical_loss": 3.557907622871949, + "tokens_seen": 1313940480 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2101853, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5543088912963867, + "objective/train/theoretical_loss": 3.5579036112285483, + "objective/train/tokens_used": 1334416864, + "theoretical_loss": 3.5579036112285483, + "tokens_seen": 1313956864 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030400200601805416, + "loss": 2.7212, + "theoretical_loss": 3.5578915766825063, + "tokens_seen": 1314006016 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003039919759277834, + "loss": 2.832, + "theoretical_loss": 3.5578755315174195, + "tokens_seen": 1314071552 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003039819458375125, + "loss": 2.6889, + "theoretical_loss": 3.557859487376571, + "tokens_seen": 1314137088 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030397191574724176, + "loss": 2.7078, + "theoretical_loss": 3.5578434442598454, + "tokens_seen": 1314202624 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030396188565697094, + "loss": 2.9657, + "theoretical_loss": 3.5578274021671255, + "tokens_seen": 1314268160 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003039518555667001, + "loss": 2.8741, + "theoretical_loss": 3.557811361098295, + "tokens_seen": 1314333696 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003039418254764293, + "loss": 2.7808, + "theoretical_loss": 3.557795321053238, + "tokens_seen": 1314399232 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003039317953861585, + "loss": 2.9634, + "theoretical_loss": 3.557779282031837, + "tokens_seen": 1314464768 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030392176529588766, + "loss": 2.7976, + "theoretical_loss": 3.5577632440339766, + "tokens_seen": 1314530304 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003039117352056169, + "loss": 2.8097, + "theoretical_loss": 3.55774720705954, + "tokens_seen": 1314595840 + }, + { + "epoch": 3.09, + "learning_rate": 0.000303901705115346, + "loss": 2.6227, + "theoretical_loss": 3.5577311711084114, + "tokens_seen": 1314661376 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030389167502507526, + "loss": 2.754, + "theoretical_loss": 3.557715136180474, + "tokens_seen": 1314726912 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003038816449348044, + "loss": 2.8084, + "theoretical_loss": 3.5576991022756115, + "tokens_seen": 1314792448 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003038716148445336, + "loss": 2.9275, + "theoretical_loss": 3.557683069393708, + "tokens_seen": 1314857984 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003038615847542628, + "loss": 2.8505, + "theoretical_loss": 3.557667037534647, + "tokens_seen": 1314923520 + }, + { + "epoch": 3.09, + "learning_rate": 0.000303851554663992, + "loss": 2.7893, + "theoretical_loss": 3.5576510066983125, + "tokens_seen": 1314989056 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030384152457372117, + "loss": 2.5944, + "theoretical_loss": 3.5576349768845885, + "tokens_seen": 1315054592 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003038314944834504, + "loss": 2.8817, + "theoretical_loss": 3.5576189480933573, + "tokens_seen": 1315120128 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030382146439317953, + "loss": 2.9398, + "theoretical_loss": 3.5576029203245048, + "tokens_seen": 1315185664 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030381143430290876, + "loss": 2.8184, + "theoretical_loss": 3.5575868935779136, + "tokens_seen": 1315251200 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003038014042126379, + "loss": 2.8515, + "theoretical_loss": 3.5575708678534683, + "tokens_seen": 1315316736 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003037913741223671, + "loss": 2.9029, + "theoretical_loss": 3.5575548431510517, + "tokens_seen": 1315382272 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003037813440320963, + "loss": 2.8055, + "theoretical_loss": 3.557538819470549, + "tokens_seen": 1315447808 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003037713139418255, + "loss": 2.6782, + "theoretical_loss": 3.557522796811843, + "tokens_seen": 1315513344 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030376128385155467, + "loss": 2.8134, + "theoretical_loss": 3.5575067751748186, + "tokens_seen": 1315578880 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2104899, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9233715534210205, + "objective/train/theoretical_loss": 3.557502769925186, + "objective/train/tokens_used": 1336055264, + "theoretical_loss": 3.557502769925186, + "tokens_seen": 1315595264 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030375125376128385, + "loss": 3.0151, + "theoretical_loss": 3.5574907545593586, + "tokens_seen": 1315644416 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030374122367101303, + "loss": 2.832, + "theoretical_loss": 3.557474734965348, + "tokens_seen": 1315709952 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030373119358074227, + "loss": 2.7856, + "theoretical_loss": 3.557458716392671, + "tokens_seen": 1315775488 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003037211634904714, + "loss": 2.7675, + "theoretical_loss": 3.5574426988412107, + "tokens_seen": 1315841024 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030371113340020063, + "loss": 2.7909, + "theoretical_loss": 3.557426682310852, + "tokens_seen": 1315906560 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003037011033099298, + "loss": 2.7913, + "theoretical_loss": 3.557410666801478, + "tokens_seen": 1315972096 + }, + { + "epoch": 3.09, + "learning_rate": 0.000303691073219659, + "loss": 2.8998, + "theoretical_loss": 3.557394652312973, + "tokens_seen": 1316037632 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030368104312938817, + "loss": 2.8875, + "theoretical_loss": 3.557378638845222, + "tokens_seen": 1316103168 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030367101303911735, + "loss": 2.712, + "theoretical_loss": 3.557362626398108, + "tokens_seen": 1316168704 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030366098294884653, + "loss": 2.8339, + "theoretical_loss": 3.557346614971516, + "tokens_seen": 1316234240 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030365095285857577, + "loss": 2.8185, + "theoretical_loss": 3.5573306045653297, + "tokens_seen": 1316299776 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003036409227683049, + "loss": 2.8357, + "theoretical_loss": 3.5573145951794327, + "tokens_seen": 1316365312 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030363089267803413, + "loss": 2.687, + "theoretical_loss": 3.557298586813711, + "tokens_seen": 1316430848 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030362086258776326, + "loss": 2.7541, + "theoretical_loss": 3.5572825794680467, + "tokens_seen": 1316496384 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003036108324974925, + "loss": 2.736, + "theoretical_loss": 3.557266573142325, + "tokens_seen": 1316561920 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003036008024072217, + "loss": 2.9111, + "theoretical_loss": 3.5572505678364306, + "tokens_seen": 1316627456 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030359077231695086, + "loss": 2.9424, + "theoretical_loss": 3.557234563550247, + "tokens_seen": 1316692992 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030358074222668004, + "loss": 2.7584, + "theoretical_loss": 3.557218560283659, + "tokens_seen": 1316758528 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003035707121364092, + "loss": 2.9231, + "theoretical_loss": 3.5572025580365505, + "tokens_seen": 1316824064 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003035606820461384, + "loss": 2.8938, + "theoretical_loss": 3.557186556808806, + "tokens_seen": 1316889600 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030355065195586763, + "loss": 2.8521, + "theoretical_loss": 3.55717055660031, + "tokens_seen": 1316955136 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030354062186559676, + "loss": 2.8632, + "theoretical_loss": 3.5571545574109464, + "tokens_seen": 1317020672 + }, + { + "epoch": 3.09, + "learning_rate": 0.000303530591775326, + "loss": 2.8781, + "theoretical_loss": 3.5571385592405997, + "tokens_seen": 1317086208 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003035205616850552, + "loss": 2.7972, + "theoretical_loss": 3.5571225620891544, + "tokens_seen": 1317151744 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030351053159478436, + "loss": 2.8372, + "theoretical_loss": 3.557106565956496, + "tokens_seen": 1317217280 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2106316, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.423259735107422, + "objective/train/theoretical_loss": 3.5571025670825027, + "objective/train/tokens_used": 1337693664, + "theoretical_loss": 3.5571025670825027, + "tokens_seen": 1317233664 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030350050150451354, + "loss": 2.7434, + "theoretical_loss": 3.557090570842507, + "tokens_seen": 1317282816 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003034904714142427, + "loss": 2.7924, + "theoretical_loss": 3.557074576747073, + "tokens_seen": 1317348352 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003034804413239719, + "loss": 2.8007, + "theoretical_loss": 3.557058583670078, + "tokens_seen": 1317413888 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030347041123370114, + "loss": 2.8749, + "theoretical_loss": 3.557042591611407, + "tokens_seen": 1317479424 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030346038114343026, + "loss": 2.8607, + "theoretical_loss": 3.557026600570944, + "tokens_seen": 1317544960 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003034503510531595, + "loss": 2.7511, + "theoretical_loss": 3.5570106105485744, + "tokens_seen": 1317610496 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003034403209628886, + "loss": 2.8265, + "theoretical_loss": 3.5569946215441814, + "tokens_seen": 1317676032 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030343029087261786, + "loss": 2.9082, + "theoretical_loss": 3.5569786335576508, + "tokens_seen": 1317741568 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030342026078234704, + "loss": 2.9402, + "theoretical_loss": 3.5569626465888664, + "tokens_seen": 1317807104 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003034102306920762, + "loss": 2.7586, + "theoretical_loss": 3.556946660637713, + "tokens_seen": 1317872640 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003034002006018054, + "loss": 2.9024, + "theoretical_loss": 3.5569306757040753, + "tokens_seen": 1317938176 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003033901705115346, + "loss": 2.8055, + "theoretical_loss": 3.556914691787838, + "tokens_seen": 1318003712 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030338014042126377, + "loss": 2.7862, + "theoretical_loss": 3.5568987088888857, + "tokens_seen": 1318069248 + }, + { + "epoch": 3.09, + "learning_rate": 0.000303370110330993, + "loss": 2.8009, + "theoretical_loss": 3.5568827270071033, + "tokens_seen": 1318134784 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030336008024072213, + "loss": 2.8366, + "theoretical_loss": 3.556866746142375, + "tokens_seen": 1318200320 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030335005015045137, + "loss": 2.7979, + "theoretical_loss": 3.556850766294586, + "tokens_seen": 1318265856 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030334002006018055, + "loss": 2.7936, + "theoretical_loss": 3.556834787463621, + "tokens_seen": 1318331392 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030332998996990973, + "loss": 2.8834, + "theoretical_loss": 3.5568188096493643, + "tokens_seen": 1318396928 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003033199598796389, + "loss": 2.7173, + "theoretical_loss": 3.5568028328517007, + "tokens_seen": 1318462464 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003033099297893681, + "loss": 2.8458, + "theoretical_loss": 3.5567868570705157, + "tokens_seen": 1318528000 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003032998996990973, + "loss": 2.9084, + "theoretical_loss": 3.5567708823056936, + "tokens_seen": 1318593536 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003032898696088265, + "loss": 2.7781, + "theoretical_loss": 3.556754908557119, + "tokens_seen": 1318659072 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003032798395185557, + "loss": 2.8637, + "theoretical_loss": 3.5567389358246775, + "tokens_seen": 1318724608 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030326980942828487, + "loss": 2.6973, + "theoretical_loss": 3.556722964108253, + "tokens_seen": 1318790144 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030325977933801405, + "loss": 2.6848, + "theoretical_loss": 3.5567069934077313, + "tokens_seen": 1318855680 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2109209, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.809338092803955, + "objective/train/theoretical_loss": 3.5567030008913223, + "objective/train/tokens_used": 1339332064, + "theoretical_loss": 3.5567030008913223, + "tokens_seen": 1318872064 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030324974924774323, + "loss": 2.8748, + "theoretical_loss": 3.5566910237229967, + "tokens_seen": 1318921216 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030323971915747247, + "loss": 2.7958, + "theoretical_loss": 3.556675055053934, + "tokens_seen": 1318986752 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003032296890672016, + "loss": 2.8802, + "theoretical_loss": 3.556659087400429, + "tokens_seen": 1319052288 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030321965897693083, + "loss": 2.7857, + "theoretical_loss": 3.556643120762366, + "tokens_seen": 1319117824 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030320962888666, + "loss": 2.7623, + "theoretical_loss": 3.5566271551396307, + "tokens_seen": 1319183360 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003031995987963892, + "loss": 2.8826, + "theoretical_loss": 3.5566111905321067, + "tokens_seen": 1319248896 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030318956870611837, + "loss": 2.8475, + "theoretical_loss": 3.55659522693968, + "tokens_seen": 1319314432 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030317953861584755, + "loss": 2.9922, + "theoretical_loss": 3.5565792643622354, + "tokens_seen": 1319379968 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030316950852557673, + "loss": 2.711, + "theoretical_loss": 3.556563302799658, + "tokens_seen": 1319445504 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030315947843530597, + "loss": 2.7075, + "theoretical_loss": 3.556547342251833, + "tokens_seen": 1319511040 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003031494483450351, + "loss": 2.6819, + "theoretical_loss": 3.5565313827186458, + "tokens_seen": 1319576576 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030313941825476433, + "loss": 2.7439, + "theoretical_loss": 3.5565154241999806, + "tokens_seen": 1319642112 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030312938816449346, + "loss": 2.7433, + "theoretical_loss": 3.5564994666957235, + "tokens_seen": 1319707648 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003031193580742227, + "loss": 2.9033, + "theoretical_loss": 3.5564835102057586, + "tokens_seen": 1319773184 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003031093279839519, + "loss": 2.8782, + "theoretical_loss": 3.556467554729972, + "tokens_seen": 1319838720 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030309929789368106, + "loss": 2.8193, + "theoretical_loss": 3.5564516002682485, + "tokens_seen": 1319904256 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030308926780341024, + "loss": 2.9363, + "theoretical_loss": 3.556435646820473, + "tokens_seen": 1319969792 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003030792377131394, + "loss": 2.6923, + "theoretical_loss": 3.5564196943865314, + "tokens_seen": 1320035328 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003030692076228686, + "loss": 2.9306, + "theoretical_loss": 3.5564037429663085, + "tokens_seen": 1320100864 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030305917753259783, + "loss": 2.8037, + "theoretical_loss": 3.5563877925596894, + "tokens_seen": 1320166400 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030304914744232696, + "loss": 2.7755, + "theoretical_loss": 3.55637184316656, + "tokens_seen": 1320231936 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003030391173520562, + "loss": 2.801, + "theoretical_loss": 3.5563558947868055, + "tokens_seen": 1320297472 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003030290872617854, + "loss": 2.6078, + "theoretical_loss": 3.5563399474203106, + "tokens_seen": 1320363008 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030301905717151456, + "loss": 2.9282, + "theoretical_loss": 3.556324001066961, + "tokens_seen": 1320428544 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030300902708124374, + "loss": 2.9702, + "theoretical_loss": 3.5563080557266415, + "tokens_seen": 1320494080 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2111515, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.835580587387085, + "objective/train/theoretical_loss": 3.5563040695498347, + "objective/train/tokens_used": 1340970464, + "theoretical_loss": 3.5563040695498347, + "tokens_seen": 1320510464 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003029989969909729, + "loss": 2.9063, + "theoretical_loss": 3.556292111399239, + "tokens_seen": 1320559616 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003029889669007021, + "loss": 2.7503, + "theoretical_loss": 3.5562761680846373, + "tokens_seen": 1320625152 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030297893681043134, + "loss": 2.6869, + "theoretical_loss": 3.5562602257827227, + "tokens_seen": 1320690688 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030296890672016046, + "loss": 2.856, + "theoretical_loss": 3.5562442844933804, + "tokens_seen": 1320756224 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003029588766298897, + "loss": 2.818, + "theoretical_loss": 3.5562283442164957, + "tokens_seen": 1320821760 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003029488465396188, + "loss": 2.8848, + "theoretical_loss": 3.556212404951954, + "tokens_seen": 1320887296 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030293881644934806, + "loss": 2.8884, + "theoretical_loss": 3.5561964666996415, + "tokens_seen": 1320952832 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030292878635907724, + "loss": 2.6668, + "theoretical_loss": 3.5561805294594424, + "tokens_seen": 1321018368 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003029187562688064, + "loss": 2.854, + "theoretical_loss": 3.5561645932312436, + "tokens_seen": 1321083904 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003029087261785356, + "loss": 2.904, + "theoretical_loss": 3.5561486580149295, + "tokens_seen": 1321149440 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003028986960882648, + "loss": 2.9453, + "theoretical_loss": 3.5561327238103866, + "tokens_seen": 1321214976 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030288866599799397, + "loss": 2.827, + "theoretical_loss": 3.5561167906174997, + "tokens_seen": 1321280512 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003028786359077232, + "loss": 2.915, + "theoretical_loss": 3.556100858436155, + "tokens_seen": 1321346048 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030286860581745233, + "loss": 2.9076, + "theoretical_loss": 3.5560849272662374, + "tokens_seen": 1321411584 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030285857572718157, + "loss": 2.9217, + "theoretical_loss": 3.5560689971076336, + "tokens_seen": 1321477120 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030284854563691075, + "loss": 2.9083, + "theoretical_loss": 3.556053067960228, + "tokens_seen": 1321542656 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030283851554663993, + "loss": 2.8948, + "theoretical_loss": 3.556037139823908, + "tokens_seen": 1321608192 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003028284854563691, + "loss": 2.9581, + "theoretical_loss": 3.5560212126985573, + "tokens_seen": 1321673728 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003028184553660983, + "loss": 2.8005, + "theoretical_loss": 3.556005286584063, + "tokens_seen": 1321739264 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030280842527582747, + "loss": 2.6397, + "theoretical_loss": 3.55598936148031, + "tokens_seen": 1321804800 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003027983951855567, + "loss": 2.8178, + "theoretical_loss": 3.5559734373871845, + "tokens_seen": 1321870336 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030278836509528583, + "loss": 2.8169, + "theoretical_loss": 3.555957514304572, + "tokens_seen": 1321935872 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030277833500501507, + "loss": 2.8766, + "theoretical_loss": 3.555941592232359, + "tokens_seen": 1322001408 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003027683049147442, + "loss": 2.7585, + "theoretical_loss": 3.5559256711704306, + "tokens_seen": 1322066944 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030275827482447343, + "loss": 2.9548, + "theoretical_loss": 3.555909751118672, + "tokens_seen": 1322132480 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2114440, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8776073455810547, + "objective/train/theoretical_loss": 3.5559057712635584, + "objective/train/tokens_used": 1342608864, + "theoretical_loss": 3.5559057712635584, + "tokens_seen": 1322148864 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003027482447342026, + "loss": 2.8841, + "theoretical_loss": 3.55589383207697, + "tokens_seen": 1322198016 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003027382146439318, + "loss": 2.9489, + "theoretical_loss": 3.555877914045211, + "tokens_seen": 1322263552 + }, + { + "epoch": 3.09, + "learning_rate": 0.000302728184553661, + "loss": 2.9265, + "theoretical_loss": 3.5558619970232797, + "tokens_seen": 1322329088 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003027181544633902, + "loss": 2.8388, + "theoretical_loss": 3.5558460810110626, + "tokens_seen": 1322394624 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030270812437311934, + "loss": 2.7589, + "theoretical_loss": 3.555830166008445, + "tokens_seen": 1322460160 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030269809428284857, + "loss": 2.9722, + "theoretical_loss": 3.5558142520153133, + "tokens_seen": 1322525696 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003026880641925777, + "loss": 2.8233, + "theoretical_loss": 3.555798339031554, + "tokens_seen": 1322591232 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030267803410230693, + "loss": 2.7813, + "theoretical_loss": 3.555782427057052, + "tokens_seen": 1322656768 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003026680040120361, + "loss": 2.9629, + "theoretical_loss": 3.555766516091694, + "tokens_seen": 1322722304 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003026579739217653, + "loss": 2.8082, + "theoretical_loss": 3.5557506061353656, + "tokens_seen": 1322787840 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003026479438314945, + "loss": 2.5782, + "theoretical_loss": 3.5557346971879533, + "tokens_seen": 1322853376 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030263791374122366, + "loss": 2.8297, + "theoretical_loss": 3.555718789249343, + "tokens_seen": 1322918912 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030262788365095284, + "loss": 2.6506, + "theoretical_loss": 3.55570288231942, + "tokens_seen": 1322984448 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003026178535606821, + "loss": 2.9381, + "theoretical_loss": 3.5556869763980714, + "tokens_seen": 1323049984 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003026078234704112, + "loss": 2.6774, + "theoretical_loss": 3.5556710714851825, + "tokens_seen": 1323115520 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030259779338014044, + "loss": 2.7193, + "theoretical_loss": 3.55565516758064, + "tokens_seen": 1323181056 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030258776328986956, + "loss": 2.842, + "theoretical_loss": 3.55563926468433, + "tokens_seen": 1323246592 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003025777331995988, + "loss": 2.7035, + "theoretical_loss": 3.5556233627961378, + "tokens_seen": 1323312128 + }, + { + "epoch": 3.09, + "learning_rate": 0.000302567703109328, + "loss": 2.8962, + "theoretical_loss": 3.555607461915951, + "tokens_seen": 1323377664 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030255767301905716, + "loss": 2.8768, + "theoretical_loss": 3.555591562043655, + "tokens_seen": 1323443200 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003025476429287864, + "loss": 2.868, + "theoretical_loss": 3.5555756631791358, + "tokens_seen": 1323508736 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003025376128385156, + "loss": 2.7676, + "theoretical_loss": 3.55555976532228, + "tokens_seen": 1323574272 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030252758274824476, + "loss": 2.8869, + "theoretical_loss": 3.5555438684729737, + "tokens_seen": 1323639808 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030251755265797394, + "loss": 2.7587, + "theoretical_loss": 3.555527972631103, + "tokens_seen": 1323705344 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003025075225677031, + "loss": 2.8617, + "theoretical_loss": 3.5555120777965543, + "tokens_seen": 1323770880 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2117216, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.947268009185791, + "objective/train/theoretical_loss": 3.5555081042452983, + "objective/train/tokens_used": 1344247264, + "theoretical_loss": 3.5555081042452983, + "tokens_seen": 1323787264 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003024974924774323, + "loss": 2.9292, + "theoretical_loss": 3.5554961839692147, + "tokens_seen": 1323836416 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030248746238716154, + "loss": 2.6374, + "theoretical_loss": 3.555480291148969, + "tokens_seen": 1323901952 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030247743229689066, + "loss": 2.8816, + "theoretical_loss": 3.5554643993357047, + "tokens_seen": 1323967488 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003024674022066199, + "loss": 2.8072, + "theoretical_loss": 3.5554485085293077, + "tokens_seen": 1324033024 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030245737211634903, + "loss": 2.8578, + "theoretical_loss": 3.5554326187296645, + "tokens_seen": 1324098560 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030244734202607826, + "loss": 2.8127, + "theoretical_loss": 3.5554167299366615, + "tokens_seen": 1324164096 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030243731193580744, + "loss": 2.8276, + "theoretical_loss": 3.5554008421501853, + "tokens_seen": 1324229632 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003024272818455366, + "loss": 2.9029, + "theoretical_loss": 3.555384955370122, + "tokens_seen": 1324295168 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003024172517552658, + "loss": 2.7823, + "theoretical_loss": 3.555369069596358, + "tokens_seen": 1324360704 + }, + { + "epoch": 3.09, + "learning_rate": 0.000302407221664995, + "loss": 2.8552, + "theoretical_loss": 3.5553531848287796, + "tokens_seen": 1324426240 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030239719157472417, + "loss": 2.6954, + "theoretical_loss": 3.555337301067274, + "tokens_seen": 1324491776 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003023871614844534, + "loss": 2.7602, + "theoretical_loss": 3.5553214183117268, + "tokens_seen": 1324557312 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030237713139418253, + "loss": 2.8304, + "theoretical_loss": 3.5553055365620256, + "tokens_seen": 1324622848 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030236710130391177, + "loss": 2.7648, + "theoretical_loss": 3.555289655818056, + "tokens_seen": 1324688384 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030235707121364095, + "loss": 2.8259, + "theoretical_loss": 3.5552737760797055, + "tokens_seen": 1324753920 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030234704112337013, + "loss": 2.6688, + "theoretical_loss": 3.5552578973468596, + "tokens_seen": 1324819456 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003023370110330993, + "loss": 2.869, + "theoretical_loss": 3.5552420196194054, + "tokens_seen": 1324884992 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003023269809428285, + "loss": 2.8898, + "theoretical_loss": 3.5552261428972294, + "tokens_seen": 1324950528 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030231695085255767, + "loss": 2.9753, + "theoretical_loss": 3.555210267180219, + "tokens_seen": 1325016064 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003023069207622869, + "loss": 2.9156, + "theoretical_loss": 3.55519439246826, + "tokens_seen": 1325081600 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030229689067201603, + "loss": 2.7793, + "theoretical_loss": 3.555178518761239, + "tokens_seen": 1325147136 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030228686058174527, + "loss": 2.9431, + "theoretical_loss": 3.555162646059043, + "tokens_seen": 1325212672 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003022768304914744, + "loss": 3.0071, + "theoretical_loss": 3.5551467743615586, + "tokens_seen": 1325278208 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030226680040120363, + "loss": 2.9369, + "theoretical_loss": 3.5551309036686725, + "tokens_seen": 1325343744 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003022567703109328, + "loss": 2.8655, + "theoretical_loss": 3.5551150339802717, + "tokens_seen": 1325409280 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2119959, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.639852285385132, + "objective/train/theoretical_loss": 3.555111066715109, + "objective/train/tokens_used": 1345885664, + "theoretical_loss": 3.555111066715109, + "tokens_seen": 1325425664 + }, + { + "epoch": 3.09, + "learning_rate": 0.000302246740220662, + "loss": 2.8135, + "theoretical_loss": 3.5550991652962427, + "tokens_seen": 1325474816 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003022367101303912, + "loss": 2.8102, + "theoretical_loss": 3.5550832976164726, + "tokens_seen": 1325540352 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003022266800401204, + "loss": 2.783, + "theoretical_loss": 3.5550674309408477, + "tokens_seen": 1325605888 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030221664994984954, + "loss": 2.829, + "theoretical_loss": 3.555051565269255, + "tokens_seen": 1325671424 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030220661985957877, + "loss": 2.8398, + "theoretical_loss": 3.5550357006015822, + "tokens_seen": 1325736960 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003021965897693079, + "loss": 2.7614, + "theoretical_loss": 3.5550198369377144, + "tokens_seen": 1325802496 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030218655967903713, + "loss": 2.9223, + "theoretical_loss": 3.5550039742775397, + "tokens_seen": 1325868032 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003021765295887663, + "loss": 2.8765, + "theoretical_loss": 3.5549881126209453, + "tokens_seen": 1325933568 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003021664994984955, + "loss": 2.8459, + "theoretical_loss": 3.5549722519678166, + "tokens_seen": 1325999104 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003021564694082247, + "loss": 2.8285, + "theoretical_loss": 3.5549563923180423, + "tokens_seen": 1326064640 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030214643931795386, + "loss": 2.8376, + "theoretical_loss": 3.554940533671508, + "tokens_seen": 1326130176 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030213640922768304, + "loss": 2.9023, + "theoretical_loss": 3.5549246760281017, + "tokens_seen": 1326195712 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003021263791374123, + "loss": 2.8785, + "theoretical_loss": 3.5549088193877094, + "tokens_seen": 1326261248 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003021163490471414, + "loss": 2.7295, + "theoretical_loss": 3.554892963750219, + "tokens_seen": 1326326784 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030210631895687064, + "loss": 2.9109, + "theoretical_loss": 3.5548771091155165, + "tokens_seen": 1326392320 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030209628886659976, + "loss": 2.7594, + "theoretical_loss": 3.5548612554834897, + "tokens_seen": 1326457856 + }, + { + "epoch": 3.09, + "learning_rate": 0.000302086258776329, + "loss": 2.7887, + "theoretical_loss": 3.5548454028540255, + "tokens_seen": 1326523392 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003020762286860582, + "loss": 2.6845, + "theoretical_loss": 3.554829551227011, + "tokens_seen": 1326588928 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030206619859578736, + "loss": 2.8145, + "theoretical_loss": 3.5548137006023333, + "tokens_seen": 1326654464 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030205616850551654, + "loss": 2.7328, + "theoretical_loss": 3.554797850979879, + "tokens_seen": 1326720000 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003020461384152458, + "loss": 2.8265, + "theoretical_loss": 3.5547820023595365, + "tokens_seen": 1326785536 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003020361083249749, + "loss": 2.8137, + "theoretical_loss": 3.5547661547411917, + "tokens_seen": 1326851072 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030202607823470414, + "loss": 2.8541, + "theoretical_loss": 3.5547503081247322, + "tokens_seen": 1326916608 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030201604814443327, + "loss": 2.8666, + "theoretical_loss": 3.5547344625100448, + "tokens_seen": 1326982144 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003020060180541625, + "loss": 2.9462, + "theoretical_loss": 3.5547186178970174, + "tokens_seen": 1327047680 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2122481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8875608444213867, + "objective/train/theoretical_loss": 3.5547146569002566, + "objective/train/tokens_used": 1347524064, + "theoretical_loss": 3.5547146569002566, + "tokens_seen": 1327064064 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003019959879638917, + "loss": 2.8068, + "theoretical_loss": 3.554702774285537, + "tokens_seen": 1327113216 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030198595787362087, + "loss": 2.9838, + "theoretical_loss": 3.5546869316754903, + "tokens_seen": 1327178752 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030197592778335005, + "loss": 2.9922, + "theoretical_loss": 3.5546710900667655, + "tokens_seen": 1327244288 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030196589769307923, + "loss": 2.9269, + "theoretical_loss": 3.554655249459249, + "tokens_seen": 1327309824 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003019558676028084, + "loss": 2.8173, + "theoretical_loss": 3.5546394098528284, + "tokens_seen": 1327375360 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030194583751253764, + "loss": 2.681, + "theoretical_loss": 3.5546235712473915, + "tokens_seen": 1327440896 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030193580742226677, + "loss": 2.7748, + "theoretical_loss": 3.5546077336428246, + "tokens_seen": 1327506432 + }, + { + "epoch": 3.09, + "learning_rate": 0.000301925777331996, + "loss": 2.7427, + "theoretical_loss": 3.554591897039016, + "tokens_seen": 1327571968 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030191574724172513, + "loss": 2.7819, + "theoretical_loss": 3.554576061435853, + "tokens_seen": 1327637504 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030190571715145437, + "loss": 2.8072, + "theoretical_loss": 3.554560226833222, + "tokens_seen": 1327703040 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030189568706118355, + "loss": 2.7145, + "theoretical_loss": 3.5545443932310112, + "tokens_seen": 1327768576 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030188565697091273, + "loss": 2.6952, + "theoretical_loss": 3.554528560629108, + "tokens_seen": 1327834112 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003018756268806419, + "loss": 2.8961, + "theoretical_loss": 3.5545127290274, + "tokens_seen": 1327899648 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030186559679037115, + "loss": 2.6927, + "theoretical_loss": 3.554496898425774, + "tokens_seen": 1327965184 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003018555667001003, + "loss": 2.8422, + "theoretical_loss": 3.554481068824118, + "tokens_seen": 1328030720 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003018455366098295, + "loss": 2.8802, + "theoretical_loss": 3.55446524022232, + "tokens_seen": 1328096256 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030183550651955864, + "loss": 2.9006, + "theoretical_loss": 3.554449412620266, + "tokens_seen": 1328161792 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030182547642928787, + "loss": 2.8672, + "theoretical_loss": 3.5544335860178444, + "tokens_seen": 1328227328 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030181544633901705, + "loss": 2.7994, + "theoretical_loss": 3.5544177604149434, + "tokens_seen": 1328292864 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030180541624874623, + "loss": 2.7864, + "theoretical_loss": 3.5544019358114496, + "tokens_seen": 1328358400 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030179538615847547, + "loss": 2.7827, + "theoretical_loss": 3.5543861122072506, + "tokens_seen": 1328423936 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003017853560682046, + "loss": 2.7919, + "theoretical_loss": 3.5543702896022347, + "tokens_seen": 1328489472 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030177532597793383, + "loss": 2.8744, + "theoretical_loss": 3.554354467996289, + "tokens_seen": 1328555008 + }, + { + "epoch": 3.09, + "learning_rate": 0.000301765295887663, + "loss": 2.7533, + "theoretical_loss": 3.5543386473893013, + "tokens_seen": 1328620544 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003017552657973922, + "loss": 2.9037, + "theoretical_loss": 3.554322827781159, + "tokens_seen": 1328686080 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2123978, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9501872062683105, + "objective/train/theoretical_loss": 3.5543188730351805, + "objective/train/tokens_used": 1349162464, + "theoretical_loss": 3.5543188730351805, + "tokens_seen": 1328702464 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003017452357071214, + "loss": 2.7567, + "theoretical_loss": 3.5543070091717506, + "tokens_seen": 1328751616 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003017352056168506, + "loss": 2.7657, + "theoretical_loss": 3.5542911915609627, + "tokens_seen": 1328817152 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030172517552657974, + "loss": 2.797, + "theoretical_loss": 3.5542753749486833, + "tokens_seen": 1328882688 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030171514543630897, + "loss": 2.6882, + "theoretical_loss": 3.5542595593348008, + "tokens_seen": 1328948224 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003017051153460381, + "loss": 2.784, + "theoretical_loss": 3.5542437447192023, + "tokens_seen": 1329013760 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030169508525576733, + "loss": 2.8476, + "theoretical_loss": 3.5542279311017757, + "tokens_seen": 1329079296 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003016850551654965, + "loss": 2.8633, + "theoretical_loss": 3.5542121184824085, + "tokens_seen": 1329144832 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003016750250752257, + "loss": 2.8218, + "theoretical_loss": 3.554196306860989, + "tokens_seen": 1329210368 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003016649949849549, + "loss": 2.8157, + "theoretical_loss": 3.5541804962374055, + "tokens_seen": 1329275904 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030165496489468406, + "loss": 2.8494, + "theoretical_loss": 3.5541646866115446, + "tokens_seen": 1329341440 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030164493480441324, + "loss": 2.7302, + "theoretical_loss": 3.5541488779832946, + "tokens_seen": 1329406976 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003016349047141425, + "loss": 2.8744, + "theoretical_loss": 3.554133070352544, + "tokens_seen": 1329472512 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003016248746238716, + "loss": 2.7465, + "theoretical_loss": 3.55411726371918, + "tokens_seen": 1329538048 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030161484453360084, + "loss": 2.7185, + "theoretical_loss": 3.5541014580830907, + "tokens_seen": 1329603584 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030160481444332996, + "loss": 2.9361, + "theoretical_loss": 3.554085653444164, + "tokens_seen": 1329669120 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003015947843530592, + "loss": 2.8597, + "theoretical_loss": 3.554069849802288, + "tokens_seen": 1329734656 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003015847542627884, + "loss": 2.9195, + "theoretical_loss": 3.5540540471573507, + "tokens_seen": 1329800192 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030157472417251756, + "loss": 2.6951, + "theoretical_loss": 3.554038245509239, + "tokens_seen": 1329865728 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030156469408224674, + "loss": 2.6549, + "theoretical_loss": 3.554022444857843, + "tokens_seen": 1329931264 + }, + { + "epoch": 3.09, + "learning_rate": 0.000301554663991976, + "loss": 2.7233, + "theoretical_loss": 3.5540066452030494, + "tokens_seen": 1329996800 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003015446339017051, + "loss": 2.7648, + "theoretical_loss": 3.553990846544746, + "tokens_seen": 1330062336 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030153460381143434, + "loss": 2.9068, + "theoretical_loss": 3.5539750488828212, + "tokens_seen": 1330127872 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030152457372116347, + "loss": 2.8966, + "theoretical_loss": 3.5539592522171635, + "tokens_seen": 1330193408 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003015145436308927, + "loss": 2.915, + "theoretical_loss": 3.5539434565476604, + "tokens_seen": 1330258944 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003015045135406219, + "loss": 2.9695, + "theoretical_loss": 3.5539276618742, + "tokens_seen": 1330324480 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2126663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.497631788253784, + "objective/train/theoretical_loss": 3.5539237133614536, + "objective/train/tokens_used": 1350800864, + "theoretical_loss": 3.5539237133614536, + "tokens_seen": 1330340864 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030149448345035107, + "loss": 2.7926, + "theoretical_loss": 3.5539118681966713, + "tokens_seen": 1330390016 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030148445336008025, + "loss": 2.6273, + "theoretical_loss": 3.5538960755149613, + "tokens_seen": 1330455552 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030147442326980943, + "loss": 2.8386, + "theoretical_loss": 3.5538802838289585, + "tokens_seen": 1330521088 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003014643931795386, + "loss": 2.6931, + "theoretical_loss": 3.5538644931385517, + "tokens_seen": 1330586624 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030145436308926784, + "loss": 2.7836, + "theoretical_loss": 3.5538487034436286, + "tokens_seen": 1330652160 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030144433299899697, + "loss": 2.7772, + "theoretical_loss": 3.5538329147440773, + "tokens_seen": 1330717696 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003014343029087262, + "loss": 2.9266, + "theoretical_loss": 3.5538171270397863, + "tokens_seen": 1330783232 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030142427281845533, + "loss": 2.7835, + "theoretical_loss": 3.5538013403306437, + "tokens_seen": 1330848768 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030141424272818457, + "loss": 2.8441, + "theoretical_loss": 3.553785554616538, + "tokens_seen": 1330914304 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030140421263791375, + "loss": 2.8551, + "theoretical_loss": 3.5537697698973574, + "tokens_seen": 1330979840 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030139418254764293, + "loss": 2.8666, + "theoretical_loss": 3.55375398617299, + "tokens_seen": 1331045376 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003013841524573721, + "loss": 2.8992, + "theoretical_loss": 3.5537382034433236, + "tokens_seen": 1331110912 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030137412236710135, + "loss": 2.8663, + "theoretical_loss": 3.553722421708248, + "tokens_seen": 1331176448 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003013640922768305, + "loss": 2.9847, + "theoretical_loss": 3.5537066409676505, + "tokens_seen": 1331241984 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003013540621865597, + "loss": 2.8288, + "theoretical_loss": 3.55369086122142, + "tokens_seen": 1331307520 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030134403209628884, + "loss": 2.7466, + "theoretical_loss": 3.5536750824694447, + "tokens_seen": 1331373056 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030133400200601807, + "loss": 2.7806, + "theoretical_loss": 3.553659304711612, + "tokens_seen": 1331438592 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030132397191574725, + "loss": 2.8156, + "theoretical_loss": 3.5536435279478127, + "tokens_seen": 1331504128 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030131394182547643, + "loss": 2.8743, + "theoretical_loss": 3.553627752177933, + "tokens_seen": 1331569664 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003013039117352056, + "loss": 2.9136, + "theoretical_loss": 3.5536119774018626, + "tokens_seen": 1331635200 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003012938816449348, + "loss": 2.9487, + "theoretical_loss": 3.553596203619489, + "tokens_seen": 1331700736 + }, + { + "epoch": 3.09, + "learning_rate": 0.000301283851554664, + "loss": 2.9056, + "theoretical_loss": 3.553580430830702, + "tokens_seen": 1331766272 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003012738214643932, + "loss": 2.8456, + "theoretical_loss": 3.553564659035389, + "tokens_seen": 1331831808 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030126379137412234, + "loss": 2.8106, + "theoretical_loss": 3.553548888233439, + "tokens_seen": 1331897344 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003012537612838516, + "loss": 2.7353, + "theoretical_loss": 3.553533118424741, + "tokens_seen": 1331962880 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2129395, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.772055149078369, + "objective/train/theoretical_loss": 3.553529176127749, + "objective/train/tokens_used": 1352439264, + "theoretical_loss": 3.553529176127749, + "tokens_seen": 1331979264 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003012437311935807, + "loss": 2.9232, + "theoretical_loss": 3.5535173496091828, + "tokens_seen": 1332028416 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030123370110330994, + "loss": 2.6837, + "theoretical_loss": 3.5535015817866533, + "tokens_seen": 1332093952 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003012236710130391, + "loss": 2.9202, + "theoretical_loss": 3.5534858149570407, + "tokens_seen": 1332159488 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003012136409227683, + "loss": 2.7433, + "theoretical_loss": 3.553470049120235, + "tokens_seen": 1332225024 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003012036108324975, + "loss": 2.7255, + "theoretical_loss": 3.553454284276123, + "tokens_seen": 1332290560 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003011935807422267, + "loss": 2.8296, + "theoretical_loss": 3.553438520424595, + "tokens_seen": 1332356096 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030118355065195584, + "loss": 2.7407, + "theoretical_loss": 3.5534227575655386, + "tokens_seen": 1332421632 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003011735205616851, + "loss": 2.942, + "theoretical_loss": 3.553406995698843, + "tokens_seen": 1332487168 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003011634904714142, + "loss": 2.8034, + "theoretical_loss": 3.553391234824397, + "tokens_seen": 1332552704 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030115346038114344, + "loss": 2.6784, + "theoretical_loss": 3.553375474942089, + "tokens_seen": 1332618240 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003011434302908726, + "loss": 2.9225, + "theoretical_loss": 3.5533597160518076, + "tokens_seen": 1332683776 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003011334002006018, + "loss": 2.7962, + "theoretical_loss": 3.5533439581534423, + "tokens_seen": 1332749312 + }, + { + "epoch": 3.09, + "learning_rate": 0.000301123370110331, + "loss": 2.7489, + "theoretical_loss": 3.553328201246881, + "tokens_seen": 1332814848 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030111334002006016, + "loss": 2.7929, + "theoretical_loss": 3.5533124453320135, + "tokens_seen": 1332880384 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030110330992978935, + "loss": 2.9275, + "theoretical_loss": 3.553296690408728, + "tokens_seen": 1332945920 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003010932798395186, + "loss": 2.8396, + "theoretical_loss": 3.553280936476913, + "tokens_seen": 1333011456 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003010832497492477, + "loss": 2.7383, + "theoretical_loss": 3.5532651835364586, + "tokens_seen": 1333076992 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030107321965897694, + "loss": 2.7909, + "theoretical_loss": 3.5532494315872523, + "tokens_seen": 1333142528 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030106318956870607, + "loss": 2.6636, + "theoretical_loss": 3.5532336806291838, + "tokens_seen": 1333208064 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003010531594784353, + "loss": 2.6731, + "theoretical_loss": 3.5532179306621416, + "tokens_seen": 1333273600 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030104312938816454, + "loss": 2.8224, + "theoretical_loss": 3.5532021816860153, + "tokens_seen": 1333339136 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030103309929789367, + "loss": 2.8531, + "theoretical_loss": 3.5531864337006933, + "tokens_seen": 1333404672 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003010230692076229, + "loss": 2.8243, + "theoretical_loss": 3.553170686706064, + "tokens_seen": 1333470208 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003010130391173521, + "loss": 2.8879, + "theoretical_loss": 3.553154940702018, + "tokens_seen": 1333535744 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030100300902708127, + "loss": 2.9081, + "theoretical_loss": 3.553139195688443, + "tokens_seen": 1333601280 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2132198, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.324906587600708, + "objective/train/theoretical_loss": 3.553135259589798, + "objective/train/tokens_used": 1354077664, + "theoretical_loss": 3.553135259589798, + "tokens_seen": 1333617664 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030099297893681045, + "loss": 2.7262, + "theoretical_loss": 3.5531234516652286, + "tokens_seen": 1333666816 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030098294884653963, + "loss": 2.8888, + "theoretical_loss": 3.5531077086322638, + "tokens_seen": 1333732352 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003009729187562688, + "loss": 2.8172, + "theoretical_loss": 3.553091966589437, + "tokens_seen": 1333797888 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030096288866599804, + "loss": 2.837, + "theoretical_loss": 3.553076225536638, + "tokens_seen": 1333863424 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030095285857572717, + "loss": 2.844, + "theoretical_loss": 3.5530604854737557, + "tokens_seen": 1333928960 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003009428284854564, + "loss": 2.8676, + "theoretical_loss": 3.55304474640068, + "tokens_seen": 1333994496 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030093279839518553, + "loss": 2.7419, + "theoretical_loss": 3.5530290083172984, + "tokens_seen": 1334060032 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030092276830491477, + "loss": 2.7328, + "theoretical_loss": 3.5530132712235014, + "tokens_seen": 1334125568 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030091273821464395, + "loss": 2.6877, + "theoretical_loss": 3.552997535119177, + "tokens_seen": 1334191104 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030090270812437313, + "loss": 2.9013, + "theoretical_loss": 3.5529818000042157, + "tokens_seen": 1334256640 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003008926780341023, + "loss": 2.9225, + "theoretical_loss": 3.552966065878506, + "tokens_seen": 1334322176 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030088264794383155, + "loss": 2.83, + "theoretical_loss": 3.552950332741937, + "tokens_seen": 1334387712 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003008726178535607, + "loss": 2.8382, + "theoretical_loss": 3.552934600594398, + "tokens_seen": 1334453248 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003008625877632899, + "loss": 2.7904, + "theoretical_loss": 3.5529188694357785, + "tokens_seen": 1334518784 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030085255767301904, + "loss": 2.9353, + "theoretical_loss": 3.5529031392659682, + "tokens_seen": 1334584320 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030084252758274827, + "loss": 2.8243, + "theoretical_loss": 3.552887410084855, + "tokens_seen": 1334649856 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030083249749247745, + "loss": 2.9692, + "theoretical_loss": 3.55287168189233, + "tokens_seen": 1334715392 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030082246740220663, + "loss": 2.7656, + "theoretical_loss": 3.552855954688281, + "tokens_seen": 1334780928 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003008124373119358, + "loss": 2.898, + "theoretical_loss": 3.552840228472598, + "tokens_seen": 1334846464 + }, + { + "epoch": 3.09, + "learning_rate": 0.000300802407221665, + "loss": 2.6686, + "theoretical_loss": 3.5528245032451706, + "tokens_seen": 1334912000 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003007923771313942, + "loss": 2.8208, + "theoretical_loss": 3.552808779005888, + "tokens_seen": 1334977536 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003007823470411234, + "loss": 2.9083, + "theoretical_loss": 3.5527930557546394, + "tokens_seen": 1335043072 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030077231695085254, + "loss": 2.6932, + "theoretical_loss": 3.552777333491314, + "tokens_seen": 1335108608 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003007622868605818, + "loss": 2.7997, + "theoretical_loss": 3.552761612215802, + "tokens_seen": 1335174144 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003007522567703109, + "loss": 2.711, + "theoretical_loss": 3.552745891927992, + "tokens_seen": 1335239680 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2134950, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.888380289077759, + "objective/train/theoretical_loss": 3.552741962010355, + "objective/train/tokens_used": 1355716064, + "theoretical_loss": 3.552741962010355, + "tokens_seen": 1335256064 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030074222668004014, + "loss": 2.9171, + "theoretical_loss": 3.5527301726277747, + "tokens_seen": 1335305216 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003007321965897693, + "loss": 2.7915, + "theoretical_loss": 3.552714454315038, + "tokens_seen": 1335370752 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003007221664994985, + "loss": 2.8698, + "theoretical_loss": 3.5526987369896723, + "tokens_seen": 1335436288 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003007121364092277, + "loss": 2.8699, + "theoretical_loss": 3.5526830206515676, + "tokens_seen": 1335501824 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003007021063189569, + "loss": 2.9755, + "theoretical_loss": 3.552667305300612, + "tokens_seen": 1335567360 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030069207622868604, + "loss": 2.9694, + "theoretical_loss": 3.5526515909366965, + "tokens_seen": 1335632896 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003006820461384153, + "loss": 2.8146, + "theoretical_loss": 3.5526358775597098, + "tokens_seen": 1335698432 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003006720160481444, + "loss": 2.8677, + "theoretical_loss": 3.552620165169542, + "tokens_seen": 1335763968 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030066198595787364, + "loss": 2.9331, + "theoretical_loss": 3.552604453766083, + "tokens_seen": 1335829504 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003006519558676028, + "loss": 2.8527, + "theoretical_loss": 3.5525887433492214, + "tokens_seen": 1335895040 + }, + { + "epoch": 3.09, + "learning_rate": 0.000300641925777332, + "loss": 2.8186, + "theoretical_loss": 3.552573033918848, + "tokens_seen": 1335960576 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003006318956870612, + "loss": 2.8774, + "theoretical_loss": 3.552557325474851, + "tokens_seen": 1336026112 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030062186559679036, + "loss": 2.7662, + "theoretical_loss": 3.5525416180171216, + "tokens_seen": 1336091648 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030061183550651955, + "loss": 2.7693, + "theoretical_loss": 3.5525259115455485, + "tokens_seen": 1336157184 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003006018054162488, + "loss": 2.7707, + "theoretical_loss": 3.552510206060022, + "tokens_seen": 1336222720 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003005917753259779, + "loss": 2.8836, + "theoretical_loss": 3.5524945015604317, + "tokens_seen": 1336288256 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030058174523570714, + "loss": 2.6994, + "theoretical_loss": 3.5524787980466668, + "tokens_seen": 1336353792 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030057171514543627, + "loss": 2.7297, + "theoretical_loss": 3.5524630955186183, + "tokens_seen": 1336419328 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003005616850551655, + "loss": 2.8078, + "theoretical_loss": 3.552447393976175, + "tokens_seen": 1336484864 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003005516549648947, + "loss": 2.7422, + "theoretical_loss": 3.5524316934192264, + "tokens_seen": 1336550400 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030054162487462387, + "loss": 2.9593, + "theoretical_loss": 3.5524159938476636, + "tokens_seen": 1336615936 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030053159478435305, + "loss": 2.8129, + "theoretical_loss": 3.552400295261376, + "tokens_seen": 1336681472 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003005215646940823, + "loss": 2.951, + "theoretical_loss": 3.5523845976602524, + "tokens_seen": 1336747008 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003005115346038114, + "loss": 2.8289, + "theoretical_loss": 3.5523689010441837, + "tokens_seen": 1336812544 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030050150451354065, + "loss": 2.961, + "theoretical_loss": 3.55235320541306, + "tokens_seen": 1336878080 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2137671, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9193994998931885, + "objective/train/theoretical_loss": 3.5523492816591635, + "objective/train/tokens_used": 1357354464, + "theoretical_loss": 3.5523492816591635, + "tokens_seen": 1336894464 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003004914744232698, + "loss": 2.7801, + "theoretical_loss": 3.5523375107667703, + "tokens_seen": 1336943616 + }, + { + "epoch": 3.09, + "learning_rate": 0.000300481444332999, + "loss": 2.9455, + "theoretical_loss": 3.5523218171052053, + "tokens_seen": 1337009152 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003004714142427282, + "loss": 2.6817, + "theoretical_loss": 3.552306124428255, + "tokens_seen": 1337074688 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030046138415245737, + "loss": 2.7575, + "theoretical_loss": 3.5522904327358082, + "tokens_seen": 1337140224 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030045135406218655, + "loss": 2.8345, + "theoretical_loss": 3.5522747420277563, + "tokens_seen": 1337205760 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030044132397191573, + "loss": 2.74, + "theoretical_loss": 3.552259052303989, + "tokens_seen": 1337271296 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003004312938816449, + "loss": 2.7362, + "theoretical_loss": 3.5522433635643953, + "tokens_seen": 1337336832 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030042126379137415, + "loss": 2.7454, + "theoretical_loss": 3.552227675808867, + "tokens_seen": 1337402368 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003004112337011033, + "loss": 2.7976, + "theoretical_loss": 3.5522119890372927, + "tokens_seen": 1337467904 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003004012036108325, + "loss": 2.901, + "theoretical_loss": 3.5521963032495627, + "tokens_seen": 1337533440 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003003911735205617, + "loss": 2.5641, + "theoretical_loss": 3.5521806184455675, + "tokens_seen": 1337598976 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003003811434302909, + "loss": 2.8427, + "theoretical_loss": 3.552164934625197, + "tokens_seen": 1337664512 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030037111334002006, + "loss": 2.738, + "theoretical_loss": 3.5521492517883417, + "tokens_seen": 1337730048 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030036108324974924, + "loss": 2.938, + "theoretical_loss": 3.5521335699348913, + "tokens_seen": 1337795584 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003003510531594784, + "loss": 2.9089, + "theoretical_loss": 3.552117889064736, + "tokens_seen": 1337861120 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030034102306920765, + "loss": 2.9551, + "theoretical_loss": 3.5521022091777663, + "tokens_seen": 1337926656 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003003309929789368, + "loss": 2.7417, + "theoretical_loss": 3.552086530273872, + "tokens_seen": 1337992192 + }, + { + "epoch": 3.09, + "learning_rate": 0.000300320962888666, + "loss": 2.7449, + "theoretical_loss": 3.5520708523529434, + "tokens_seen": 1338057728 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030031093279839514, + "loss": 2.8452, + "theoretical_loss": 3.552055175414871, + "tokens_seen": 1338123264 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003003009027081244, + "loss": 2.8471, + "theoretical_loss": 3.5520394994595446, + "tokens_seen": 1338188800 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003002908726178536, + "loss": 2.7211, + "theoretical_loss": 3.552023824486855, + "tokens_seen": 1338254336 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030028084252758274, + "loss": 2.7511, + "theoretical_loss": 3.5520081504966923, + "tokens_seen": 1338319872 + }, + { + "epoch": 3.09, + "learning_rate": 0.000300270812437312, + "loss": 2.9812, + "theoretical_loss": 3.5519924774889464, + "tokens_seen": 1338385408 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003002607823470411, + "loss": 2.9592, + "theoretical_loss": 3.551976805463508, + "tokens_seen": 1338450944 + }, + { + "epoch": 3.09, + "learning_rate": 0.00030025075225677034, + "loss": 2.8285, + "theoretical_loss": 3.551961134420268, + "tokens_seen": 1338516480 + }, + { + "epoch": 3.09, + "objective/train/docs_used": 2139095, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8950493335723877, + "objective/train/theoretical_loss": 3.5519572168129137, + "objective/train/tokens_used": 1358992864, + "theoretical_loss": 3.5519572168129137, + "tokens_seen": 1338532864 + }, + { + "epoch": 3.09, + "learning_rate": 0.0003002407221664995, + "loss": 2.9151, + "theoretical_loss": 3.551945464359116, + "tokens_seen": 1338582016 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003002306920762287, + "loss": 2.8558, + "theoretical_loss": 3.551929795279942, + "tokens_seen": 1338647552 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003002206619859579, + "loss": 2.7637, + "theoretical_loss": 3.5519141271826378, + "tokens_seen": 1338713088 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003002106318956871, + "loss": 2.9203, + "theoretical_loss": 3.551898460067092, + "tokens_seen": 1338778624 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030020060180541624, + "loss": 2.9712, + "theoretical_loss": 3.5518827939331965, + "tokens_seen": 1338844160 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003001905717151455, + "loss": 2.9447, + "theoretical_loss": 3.5518671287808408, + "tokens_seen": 1338909696 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003001805416248746, + "loss": 2.7697, + "theoretical_loss": 3.551851464609916, + "tokens_seen": 1338975232 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030017051153460384, + "loss": 2.8369, + "theoretical_loss": 3.551835801420313, + "tokens_seen": 1339040768 + }, + { + "epoch": 3.1, + "learning_rate": 0.000300160481444333, + "loss": 2.8305, + "theoretical_loss": 3.5518201392119213, + "tokens_seen": 1339106304 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003001504513540622, + "loss": 2.8793, + "theoretical_loss": 3.551804477984631, + "tokens_seen": 1339171840 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003001404212637914, + "loss": 2.8253, + "theoretical_loss": 3.551788817738334, + "tokens_seen": 1339237376 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030013039117352056, + "loss": 2.8417, + "theoretical_loss": 3.5517731584729204, + "tokens_seen": 1339302912 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030012036108324975, + "loss": 2.8206, + "theoretical_loss": 3.5517575001882804, + "tokens_seen": 1339368448 + }, + { + "epoch": 3.1, + "learning_rate": 0.000300110330992979, + "loss": 2.7448, + "theoretical_loss": 3.5517418428843053, + "tokens_seen": 1339433984 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003001003009027081, + "loss": 2.8478, + "theoretical_loss": 3.551726186560885, + "tokens_seen": 1339499520 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030009027081243734, + "loss": 2.9788, + "theoretical_loss": 3.55171053121791, + "tokens_seen": 1339565056 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030008024072216647, + "loss": 2.6983, + "theoretical_loss": 3.5516948768552714, + "tokens_seen": 1339630592 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003000702106318957, + "loss": 2.9349, + "theoretical_loss": 3.55167922347286, + "tokens_seen": 1339696128 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003000601805416249, + "loss": 2.8514, + "theoretical_loss": 3.5516635710705664, + "tokens_seen": 1339761664 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030005015045135407, + "loss": 2.9047, + "theoretical_loss": 3.55164791964828, + "tokens_seen": 1339827200 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030004012036108325, + "loss": 2.9597, + "theoretical_loss": 3.5516322692058937, + "tokens_seen": 1339892736 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003000300902708125, + "loss": 2.8607, + "theoretical_loss": 3.551616619743297, + "tokens_seen": 1339958272 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003000200601805416, + "loss": 2.867, + "theoretical_loss": 3.5516009712603807, + "tokens_seen": 1340023808 + }, + { + "epoch": 3.1, + "learning_rate": 0.00030001003009027085, + "loss": 2.9036, + "theoretical_loss": 3.5515853237570356, + "tokens_seen": 1340089344 + }, + { + "epoch": 3.1, + "learning_rate": 0.0003, + "loss": 2.9448, + "theoretical_loss": 3.551569677233153, + "tokens_seen": 1340154880 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2142005, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9415364265441895, + "objective/train/theoretical_loss": 3.55156576575521, + "objective/train/tokens_used": 1360631264, + "theoretical_loss": 3.55156576575521, + "tokens_seen": 1340171264 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002999899699097292, + "loss": 2.9066, + "theoretical_loss": 3.5515540316886223, + "tokens_seen": 1340220416 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002999799398194584, + "loss": 2.8647, + "theoretical_loss": 3.551538387123336, + "tokens_seen": 1340285952 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029996990972918757, + "loss": 2.9797, + "theoretical_loss": 3.5515227435371837, + "tokens_seen": 1340351488 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029995987963891675, + "loss": 2.8003, + "theoretical_loss": 3.551507100930057, + "tokens_seen": 1340417024 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029994984954864593, + "loss": 2.8656, + "theoretical_loss": 3.5514914593018463, + "tokens_seen": 1340482560 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002999398194583751, + "loss": 2.832, + "theoretical_loss": 3.5514758186524427, + "tokens_seen": 1340548096 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029992978936810435, + "loss": 2.9113, + "theoretical_loss": 3.5514601789817375, + "tokens_seen": 1340613632 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002999197592778335, + "loss": 2.88, + "theoretical_loss": 3.5514445402896206, + "tokens_seen": 1340679168 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002999097291875627, + "loss": 2.852, + "theoretical_loss": 3.551428902575984, + "tokens_seen": 1340744704 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002998996990972919, + "loss": 2.7325, + "theoretical_loss": 3.551413265840718, + "tokens_seen": 1340810240 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002998896690070211, + "loss": 3.0182, + "theoretical_loss": 3.551397630083714, + "tokens_seen": 1340875776 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029987963891675026, + "loss": 2.7655, + "theoretical_loss": 3.5513819953048626, + "tokens_seen": 1340941312 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029986960882647944, + "loss": 2.8536, + "theoretical_loss": 3.551366361504055, + "tokens_seen": 1341006848 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002998595787362086, + "loss": 2.8836, + "theoretical_loss": 3.551350728681182, + "tokens_seen": 1341072384 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029984954864593785, + "loss": 2.889, + "theoretical_loss": 3.5513350968361346, + "tokens_seen": 1341137920 + }, + { + "epoch": 3.1, + "learning_rate": 0.000299839518555667, + "loss": 2.7746, + "theoretical_loss": 3.5513194659688043, + "tokens_seen": 1341203456 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002998294884653962, + "loss": 2.7187, + "theoretical_loss": 3.5513038360790823, + "tokens_seen": 1341268992 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029981945837512534, + "loss": 2.7258, + "theoretical_loss": 3.5512882071668592, + "tokens_seen": 1341334528 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002998094282848546, + "loss": 2.7577, + "theoretical_loss": 3.551272579232026, + "tokens_seen": 1341400064 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029979939819458376, + "loss": 2.8045, + "theoretical_loss": 3.551256952274475, + "tokens_seen": 1341465600 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029978936810431294, + "loss": 2.8292, + "theoretical_loss": 3.5512413262940954, + "tokens_seen": 1341531136 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002997793380140421, + "loss": 2.8583, + "theoretical_loss": 3.5512257012907797, + "tokens_seen": 1341596672 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002997693079237713, + "loss": 2.7276, + "theoretical_loss": 3.551210077264419, + "tokens_seen": 1341662208 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002997592778335005, + "loss": 2.9128, + "theoretical_loss": 3.551194454214904, + "tokens_seen": 1341727744 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002997492477432297, + "loss": 2.8395, + "theoretical_loss": 3.5511788321421265, + "tokens_seen": 1341793280 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2144837, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.459381103515625, + "objective/train/theoretical_loss": 3.5511749267765342, + "objective/train/tokens_used": 1362269664, + "theoretical_loss": 3.5511749267765342, + "tokens_seen": 1341809664 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029973921765295885, + "loss": 2.8207, + "theoretical_loss": 3.5511632110459774, + "tokens_seen": 1341858816 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002997291875626881, + "loss": 2.9319, + "theoretical_loss": 3.551147590926348, + "tokens_seen": 1341924352 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029971915747241726, + "loss": 2.9642, + "theoretical_loss": 3.5511319717831293, + "tokens_seen": 1341989888 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029970912738214644, + "loss": 2.9385, + "theoretical_loss": 3.551116353616213, + "tokens_seen": 1342055424 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002996990972918756, + "loss": 2.9847, + "theoretical_loss": 3.5511007364254903, + "tokens_seen": 1342120960 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002996890672016048, + "loss": 2.8249, + "theoretical_loss": 3.5510851202108524, + "tokens_seen": 1342186496 + }, + { + "epoch": 3.1, + "learning_rate": 0.000299679037111334, + "loss": 2.7226, + "theoretical_loss": 3.551069504972191, + "tokens_seen": 1342252032 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002996690070210632, + "loss": 2.6069, + "theoretical_loss": 3.5510538907093974, + "tokens_seen": 1342317568 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029965897693079235, + "loss": 2.8062, + "theoretical_loss": 3.5510382774223617, + "tokens_seen": 1342383104 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002996489468405216, + "loss": 2.9381, + "theoretical_loss": 3.5510226651109775, + "tokens_seen": 1342448640 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002996389167502507, + "loss": 2.811, + "theoretical_loss": 3.551007053775134, + "tokens_seen": 1342514176 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029962888665997995, + "loss": 2.8154, + "theoretical_loss": 3.5509914434147243, + "tokens_seen": 1342579712 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029961885656970913, + "loss": 2.8525, + "theoretical_loss": 3.5509758340296393, + "tokens_seen": 1342645248 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002996088264794383, + "loss": 2.7764, + "theoretical_loss": 3.55096022561977, + "tokens_seen": 1342710784 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002995987963891675, + "loss": 2.8289, + "theoretical_loss": 3.5509446181850084, + "tokens_seen": 1342776320 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029958876629889667, + "loss": 2.9749, + "theoretical_loss": 3.5509290117252457, + "tokens_seen": 1342841856 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029957873620862585, + "loss": 2.7604, + "theoretical_loss": 3.5509134062403738, + "tokens_seen": 1342907392 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002995687061183551, + "loss": 2.8344, + "theoretical_loss": 3.550897801730284, + "tokens_seen": 1342972928 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002995586760280842, + "loss": 2.9502, + "theoretical_loss": 3.5508821981948673, + "tokens_seen": 1343038464 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029954864593781345, + "loss": 2.804, + "theoretical_loss": 3.5508665956340164, + "tokens_seen": 1343104000 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002995386158475427, + "loss": 2.7066, + "theoretical_loss": 3.550850994047622, + "tokens_seen": 1343169536 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002995285857572718, + "loss": 2.7256, + "theoretical_loss": 3.550835393435576, + "tokens_seen": 1343235072 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029951855566700105, + "loss": 2.9182, + "theoretical_loss": 3.5508197937977704, + "tokens_seen": 1343300608 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002995085255767302, + "loss": 2.8639, + "theoretical_loss": 3.550804195134096, + "tokens_seen": 1343366144 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002994984954864594, + "loss": 2.9604, + "theoretical_loss": 3.550788597444445, + "tokens_seen": 1343431680 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2147756, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8367528915405273, + "objective/train/theoretical_loss": 3.55078469817421, + "objective/train/tokens_used": 1363908064, + "theoretical_loss": 3.55078469817421, + "tokens_seen": 1343448064 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002994884653961886, + "loss": 2.7288, + "theoretical_loss": 3.5507730007287086, + "tokens_seen": 1343497216 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029947843530591777, + "loss": 2.7401, + "theoretical_loss": 3.550757404986779, + "tokens_seen": 1343562752 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029946840521564695, + "loss": 2.8976, + "theoretical_loss": 3.5507418102185477, + "tokens_seen": 1343628288 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029945837512537613, + "loss": 2.8984, + "theoretical_loss": 3.5507262164239073, + "tokens_seen": 1343693824 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002994483450351053, + "loss": 3.0403, + "theoretical_loss": 3.5507106236027477, + "tokens_seen": 1343759360 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029943831494483455, + "loss": 2.918, + "theoretical_loss": 3.550695031754962, + "tokens_seen": 1343824896 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002994282848545637, + "loss": 2.8882, + "theoretical_loss": 3.550679440880441, + "tokens_seen": 1343890432 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002994182547642929, + "loss": 2.7882, + "theoretical_loss": 3.5506638509790784, + "tokens_seen": 1343955968 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002994082246740221, + "loss": 2.9573, + "theoretical_loss": 3.550648262050764, + "tokens_seen": 1344021504 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002993981945837513, + "loss": 2.9168, + "theoretical_loss": 3.5506326740953904, + "tokens_seen": 1344087040 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029938816449348046, + "loss": 2.8656, + "theoretical_loss": 3.5506170871128493, + "tokens_seen": 1344152576 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029937813440320964, + "loss": 2.814, + "theoretical_loss": 3.5506015011030327, + "tokens_seen": 1344218112 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002993681043129388, + "loss": 2.8484, + "theoretical_loss": 3.550585916065832, + "tokens_seen": 1344283648 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029935807422266805, + "loss": 2.8618, + "theoretical_loss": 3.55057033200114, + "tokens_seen": 1344349184 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002993480441323972, + "loss": 2.9495, + "theoretical_loss": 3.5505547489088483, + "tokens_seen": 1344414720 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002993380140421264, + "loss": 2.8482, + "theoretical_loss": 3.550539166788848, + "tokens_seen": 1344480256 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029932798395185554, + "loss": 2.8322, + "theoretical_loss": 3.5505235856410327, + "tokens_seen": 1344545792 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002993179538615848, + "loss": 2.8367, + "theoretical_loss": 3.550508005465293, + "tokens_seen": 1344611328 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029930792377131396, + "loss": 2.627, + "theoretical_loss": 3.5504924262615205, + "tokens_seen": 1344676864 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029929789368104314, + "loss": 2.8203, + "theoretical_loss": 3.5504768480296085, + "tokens_seen": 1344742400 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002992878635907723, + "loss": 2.7824, + "theoretical_loss": 3.550461270769448, + "tokens_seen": 1344807936 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002992778335005015, + "loss": 2.6525, + "theoretical_loss": 3.550445694480932, + "tokens_seen": 1344873472 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002992678034102307, + "loss": 2.7646, + "theoretical_loss": 3.550430119163952, + "tokens_seen": 1344939008 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002992577733199599, + "loss": 2.8959, + "theoretical_loss": 3.5504145448184, + "tokens_seen": 1345004544 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029924774322968905, + "loss": 2.7762, + "theoretical_loss": 3.550398971444168, + "tokens_seen": 1345070080 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2150607, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.719716787338257, + "objective/train/theoretical_loss": 3.5503950782523663, + "objective/train/tokens_used": 1365546464, + "theoretical_loss": 3.5503950782523663, + "tokens_seen": 1345086464 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002992377131394183, + "loss": 2.8341, + "theoretical_loss": 3.5503833990411486, + "tokens_seen": 1345135616 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029922768304914746, + "loss": 2.8661, + "theoretical_loss": 3.5503678276092336, + "tokens_seen": 1345201152 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029921765295887664, + "loss": 2.7176, + "theoretical_loss": 3.5503522571483153, + "tokens_seen": 1345266688 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002992076228686058, + "loss": 2.8694, + "theoretical_loss": 3.5503366876582847, + "tokens_seen": 1345332224 + }, + { + "epoch": 3.1, + "learning_rate": 0.000299197592778335, + "loss": 2.8371, + "theoretical_loss": 3.550321119139036, + "tokens_seen": 1345397760 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002991875626880642, + "loss": 2.9505, + "theoretical_loss": 3.55030555159046, + "tokens_seen": 1345463296 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002991775325977934, + "loss": 2.6622, + "theoretical_loss": 3.550289985012449, + "tokens_seen": 1345528832 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029916750250752255, + "loss": 2.6438, + "theoretical_loss": 3.5502744194048956, + "tokens_seen": 1345594368 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002991574724172518, + "loss": 2.9432, + "theoretical_loss": 3.550258854767692, + "tokens_seen": 1345659904 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002991474423269809, + "loss": 2.9009, + "theoretical_loss": 3.5502432911007302, + "tokens_seen": 1345725440 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029913741223671015, + "loss": 2.9049, + "theoretical_loss": 3.550227728403903, + "tokens_seen": 1345790976 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029912738214643933, + "loss": 2.8369, + "theoretical_loss": 3.5502121666771025, + "tokens_seen": 1345856512 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002991173520561685, + "loss": 2.8441, + "theoretical_loss": 3.5501966059202203, + "tokens_seen": 1345922048 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002991073219658977, + "loss": 2.7423, + "theoretical_loss": 3.550181046133149, + "tokens_seen": 1345987584 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029909729187562687, + "loss": 2.8551, + "theoretical_loss": 3.550165487315782, + "tokens_seen": 1346053120 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029908726178535605, + "loss": 2.8951, + "theoretical_loss": 3.5501499294680103, + "tokens_seen": 1346118656 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002990772316950853, + "loss": 2.9154, + "theoretical_loss": 3.550134372589727, + "tokens_seen": 1346184192 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002990672016048144, + "loss": 2.9017, + "theoretical_loss": 3.5501188166808246, + "tokens_seen": 1346249728 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029905717151454365, + "loss": 2.7133, + "theoretical_loss": 3.550103261741195, + "tokens_seen": 1346315264 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029904714142427283, + "loss": 2.8532, + "theoretical_loss": 3.5500877077707305, + "tokens_seen": 1346380800 + }, + { + "epoch": 3.1, + "learning_rate": 0.000299037111334002, + "loss": 2.8705, + "theoretical_loss": 3.5500721547693246, + "tokens_seen": 1346446336 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002990270812437312, + "loss": 2.8323, + "theoretical_loss": 3.550056602736869, + "tokens_seen": 1346511872 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002990170511534604, + "loss": 2.8457, + "theoretical_loss": 3.5500410516732552, + "tokens_seen": 1346577408 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029900702106318956, + "loss": 2.8241, + "theoretical_loss": 3.5500255015783777, + "tokens_seen": 1346642944 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002989969909729188, + "loss": 2.7719, + "theoretical_loss": 3.550009952452128, + "tokens_seen": 1346708480 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0513622760772705, + "objective/train/theoretical_loss": 3.550006065321901, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.550006065321901, + "tokens_seen": 1346724864 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002989869608826479, + "loss": 2.9796, + "theoretical_loss": 3.5499944042943987, + "tokens_seen": 1346774016 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029897693079237715, + "loss": 2.8765, + "theoretical_loss": 3.5499788571050823, + "tokens_seen": 1346839552 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002989669007021063, + "loss": 2.7154, + "theoretical_loss": 3.5499633108840714, + "tokens_seen": 1346905088 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002989568706118355, + "loss": 2.6827, + "theoretical_loss": 3.5499477656312584, + "tokens_seen": 1346970624 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002989468405215647, + "loss": 2.9746, + "theoretical_loss": 3.549932221346536, + "tokens_seen": 1347036160 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002989368104312939, + "loss": 2.6232, + "theoretical_loss": 3.5499166780297973, + "tokens_seen": 1347101696 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029892678034102306, + "loss": 2.6637, + "theoretical_loss": 3.5499011356809342, + "tokens_seen": 1347167232 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002989167502507523, + "loss": 2.8785, + "theoretical_loss": 3.54988559429984, + "tokens_seen": 1347232768 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002989067201604814, + "loss": 2.7592, + "theoretical_loss": 3.5498700538864068, + "tokens_seen": 1347298304 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029889669007021066, + "loss": 2.7782, + "theoretical_loss": 3.5498545144405274, + "tokens_seen": 1347363840 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002988866599799398, + "loss": 2.8768, + "theoretical_loss": 3.5498389759620954, + "tokens_seen": 1347429376 + }, + { + "epoch": 3.1, + "learning_rate": 0.000298876629889669, + "loss": 2.7205, + "theoretical_loss": 3.5498234384510017, + "tokens_seen": 1347494912 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002988665997993982, + "loss": 2.8132, + "theoretical_loss": 3.549807901907141, + "tokens_seen": 1347560448 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002988565697091274, + "loss": 2.8442, + "theoretical_loss": 3.5497923663304043, + "tokens_seen": 1347625984 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029884653961885656, + "loss": 2.9416, + "theoretical_loss": 3.5497768317206857, + "tokens_seen": 1347691520 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029883650952858574, + "loss": 2.7131, + "theoretical_loss": 3.549761298077878, + "tokens_seen": 1347757056 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002988264794383149, + "loss": 2.7877, + "theoretical_loss": 3.549745765401873, + "tokens_seen": 1347822592 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029881644934804416, + "loss": 2.8787, + "theoretical_loss": 3.5497302336925642, + "tokens_seen": 1347888128 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002988064192577733, + "loss": 2.6973, + "theoretical_loss": 3.549714702949844, + "tokens_seen": 1347953664 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002987963891675025, + "loss": 2.8966, + "theoretical_loss": 3.5496991731736056, + "tokens_seen": 1348019200 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002987863590772317, + "loss": 2.8086, + "theoretical_loss": 3.549683644363742, + "tokens_seen": 1348084736 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002987763289869609, + "loss": 2.9631, + "theoretical_loss": 3.5496681165201456, + "tokens_seen": 1348150272 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002987662988966901, + "loss": 2.7999, + "theoretical_loss": 3.54965258964271, + "tokens_seen": 1348215808 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029875626880641925, + "loss": 2.8481, + "theoretical_loss": 3.5496370637313275, + "tokens_seen": 1348281344 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002987462387161485, + "loss": 2.9815, + "theoretical_loss": 3.549621538785891, + "tokens_seen": 1348346880 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.871561288833618, + "objective/train/theoretical_loss": 3.5496176577004483, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.5496176577004483, + "tokens_seen": 1348363264 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029873620862587766, + "loss": 2.7383, + "theoretical_loss": 3.549606014806294, + "tokens_seen": 1348412416 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029872617853560684, + "loss": 2.8191, + "theoretical_loss": 3.5495904917924292, + "tokens_seen": 1348477952 + }, + { + "epoch": 3.1, + "learning_rate": 0.000298716148445336, + "loss": 2.7421, + "theoretical_loss": 3.5495749697441896, + "tokens_seen": 1348543488 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002987061183550652, + "loss": 2.8438, + "theoretical_loss": 3.5495594486614683, + "tokens_seen": 1348609024 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002986960882647944, + "loss": 2.7563, + "theoretical_loss": 3.549543928544158, + "tokens_seen": 1348674560 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002986860581745236, + "loss": 2.9, + "theoretical_loss": 3.5495284093921518, + "tokens_seen": 1348740096 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029867602808425275, + "loss": 2.8191, + "theoretical_loss": 3.5495128912053433, + "tokens_seen": 1348805632 + }, + { + "epoch": 3.1, + "learning_rate": 0.000298665997993982, + "loss": 2.7278, + "theoretical_loss": 3.549497373983625, + "tokens_seen": 1348871168 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002986559679037111, + "loss": 2.8186, + "theoretical_loss": 3.5494818577268905, + "tokens_seen": 1348936704 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029864593781344035, + "loss": 2.7267, + "theoretical_loss": 3.5494663424350326, + "tokens_seen": 1349002240 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029863590772316953, + "loss": 2.8383, + "theoretical_loss": 3.549450828107944, + "tokens_seen": 1349067776 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002986258776328987, + "loss": 2.7936, + "theoretical_loss": 3.549435314745519, + "tokens_seen": 1349133312 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002986158475426279, + "loss": 2.7992, + "theoretical_loss": 3.5494198023476495, + "tokens_seen": 1349198848 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029860581745235707, + "loss": 2.6329, + "theoretical_loss": 3.549404290914229, + "tokens_seen": 1349264384 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029859578736208625, + "loss": 2.8733, + "theoretical_loss": 3.5493887804451516, + "tokens_seen": 1349329920 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002985857572718155, + "loss": 2.9568, + "theoretical_loss": 3.5493732709403094, + "tokens_seen": 1349395456 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002985757271815446, + "loss": 2.8062, + "theoretical_loss": 3.5493577623995964, + "tokens_seen": 1349460992 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029856569709127385, + "loss": 2.8709, + "theoretical_loss": 3.549342254822905, + "tokens_seen": 1349526528 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029855566700100303, + "loss": 2.7127, + "theoretical_loss": 3.5493267482101296, + "tokens_seen": 1349592064 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002985456369107322, + "loss": 2.7183, + "theoretical_loss": 3.5493112425611626, + "tokens_seen": 1349657600 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002985356068204614, + "loss": 2.9329, + "theoretical_loss": 3.5492957378758976, + "tokens_seen": 1349723136 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002985255767301906, + "loss": 2.6905, + "theoretical_loss": 3.549280234154228, + "tokens_seen": 1349788672 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029851554663991976, + "loss": 2.676, + "theoretical_loss": 3.549264731396047, + "tokens_seen": 1349854208 + }, + { + "epoch": 3.1, + "learning_rate": 0.000298505516549649, + "loss": 2.7193, + "theoretical_loss": 3.549249229601248, + "tokens_seen": 1349919744 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002984954864593781, + "loss": 2.7678, + "theoretical_loss": 3.5492337287697238, + "tokens_seen": 1349985280 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8719966411590576, + "objective/train/theoretical_loss": 3.549229853712342, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.549229853712342, + "tokens_seen": 1350001664 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029848545636910735, + "loss": 2.7856, + "theoretical_loss": 3.5492182289013687, + "tokens_seen": 1350050816 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002984754262788365, + "loss": 2.9184, + "theoretical_loss": 3.5492027299960762, + "tokens_seen": 1350116352 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002984653961885657, + "loss": 2.8628, + "theoretical_loss": 3.5491872320537388, + "tokens_seen": 1350181888 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002984553660982949, + "loss": 2.6603, + "theoretical_loss": 3.5491717350742498, + "tokens_seen": 1350247424 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002984453360080241, + "loss": 2.7287, + "theoretical_loss": 3.549156239057504, + "tokens_seen": 1350312960 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029843530591775326, + "loss": 2.8585, + "theoretical_loss": 3.5491407440033935, + "tokens_seen": 1350378496 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002984252758274825, + "loss": 2.7015, + "theoretical_loss": 3.549125249911813, + "tokens_seen": 1350444032 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002984152457372116, + "loss": 2.7359, + "theoretical_loss": 3.5491097567826553, + "tokens_seen": 1350509568 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029840521564694086, + "loss": 2.91, + "theoretical_loss": 3.549094264615814, + "tokens_seen": 1350575104 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029839518555667, + "loss": 2.8299, + "theoretical_loss": 3.549078773411182, + "tokens_seen": 1350640640 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002983851554663992, + "loss": 2.6665, + "theoretical_loss": 3.549063283168654, + "tokens_seen": 1350706176 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002983751253761284, + "loss": 2.7298, + "theoretical_loss": 3.549047793888123, + "tokens_seen": 1350771712 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002983650952858576, + "loss": 2.9594, + "theoretical_loss": 3.549032305569483, + "tokens_seen": 1350837248 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029835506519558676, + "loss": 2.7771, + "theoretical_loss": 3.549016818212627, + "tokens_seen": 1350902784 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029834503510531594, + "loss": 2.7617, + "theoretical_loss": 3.5490013318174487, + "tokens_seen": 1350968320 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002983350050150451, + "loss": 2.8749, + "theoretical_loss": 3.548985846383842, + "tokens_seen": 1351033856 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029832497492477436, + "loss": 2.853, + "theoretical_loss": 3.5489703619117003, + "tokens_seen": 1351099392 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002983149448345035, + "loss": 2.8422, + "theoretical_loss": 3.548954878400918, + "tokens_seen": 1351164928 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002983049147442327, + "loss": 2.7417, + "theoretical_loss": 3.548939395851388, + "tokens_seen": 1351230464 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029829488465396185, + "loss": 2.7711, + "theoretical_loss": 3.5489239142630042, + "tokens_seen": 1351296000 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002982848545636911, + "loss": 2.7211, + "theoretical_loss": 3.54890843363566, + "tokens_seen": 1351361536 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029827482447342026, + "loss": 2.9005, + "theoretical_loss": 3.5488929539692498, + "tokens_seen": 1351427072 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029826479438314945, + "loss": 2.9417, + "theoretical_loss": 3.548877475263667, + "tokens_seen": 1351492608 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002982547642928786, + "loss": 2.8181, + "theoretical_loss": 3.548861997518806, + "tokens_seen": 1351558144 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029824473420260786, + "loss": 2.8442, + "theoretical_loss": 3.5488465207345588, + "tokens_seen": 1351623680 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.900495767593384, + "objective/train/theoretical_loss": 3.548842651688581, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.548842651688581, + "tokens_seen": 1351640064 + }, + { + "epoch": 3.1, + "learning_rate": 0.000298234704112337, + "loss": 2.7328, + "theoretical_loss": 3.5488310449108216, + "tokens_seen": 1351689216 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002982246740220662, + "loss": 2.7624, + "theoretical_loss": 3.5488155700474864, + "tokens_seen": 1351754752 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029821464393179535, + "loss": 2.7572, + "theoretical_loss": 3.548800096144448, + "tokens_seen": 1351820288 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002982046138415246, + "loss": 2.7214, + "theoretical_loss": 3.5487846232015996, + "tokens_seen": 1351885824 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029819458375125377, + "loss": 2.7023, + "theoretical_loss": 3.5487691512188357, + "tokens_seen": 1351951360 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029818455366098295, + "loss": 2.8978, + "theoretical_loss": 3.5487536801960498, + "tokens_seen": 1352016896 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029817452357071213, + "loss": 2.5903, + "theoretical_loss": 3.548738210133136, + "tokens_seen": 1352082432 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002981644934804413, + "loss": 2.802, + "theoretical_loss": 3.548722741029988, + "tokens_seen": 1352147968 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002981544633901705, + "loss": 2.9428, + "theoretical_loss": 3.5487072728865003, + "tokens_seen": 1352213504 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029814443329989973, + "loss": 2.7555, + "theoretical_loss": 3.548691805702566, + "tokens_seen": 1352279040 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029813440320962885, + "loss": 2.7843, + "theoretical_loss": 3.54867633947808, + "tokens_seen": 1352344576 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002981243731193581, + "loss": 2.9574, + "theoretical_loss": 3.548660874212935, + "tokens_seen": 1352410112 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002981143430290872, + "loss": 2.7696, + "theoretical_loss": 3.5486454099070266, + "tokens_seen": 1352475648 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029810431293881645, + "loss": 2.8293, + "theoretical_loss": 3.548629946560248, + "tokens_seen": 1352541184 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029809428284854563, + "loss": 2.7404, + "theoretical_loss": 3.548614484172493, + "tokens_seen": 1352606720 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002980842527582748, + "loss": 2.8493, + "theoretical_loss": 3.548599022743656, + "tokens_seen": 1352672256 + }, + { + "epoch": 3.1, + "learning_rate": 0.000298074222668004, + "loss": 2.6158, + "theoretical_loss": 3.548583562273631, + "tokens_seen": 1352737792 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029806419257773323, + "loss": 2.857, + "theoretical_loss": 3.548568102762312, + "tokens_seen": 1352803328 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029805416248746236, + "loss": 2.5727, + "theoretical_loss": 3.5485526442095936, + "tokens_seen": 1352868864 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002980441323971916, + "loss": 2.833, + "theoretical_loss": 3.5485371866153694, + "tokens_seen": 1352934400 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002980341023069208, + "loss": 2.8016, + "theoretical_loss": 3.5485217299795337, + "tokens_seen": 1352999936 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029802407221664996, + "loss": 2.7272, + "theoretical_loss": 3.548506274301981, + "tokens_seen": 1353065472 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002980140421263792, + "loss": 2.7675, + "theoretical_loss": 3.5484908195826046, + "tokens_seen": 1353131008 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002980040120361083, + "loss": 2.8083, + "theoretical_loss": 3.5484753658212997, + "tokens_seen": 1353196544 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029799398194583755, + "loss": 2.821, + "theoretical_loss": 3.54845991301796, + "tokens_seen": 1353262080 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0145599842071533, + "objective/train/theoretical_loss": 3.548456049966795, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.548456049966795, + "tokens_seen": 1353278464 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002979839518555667, + "loss": 2.9161, + "theoretical_loss": 3.5484444611724797, + "tokens_seen": 1353327616 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002979739217652959, + "loss": 2.9212, + "theoretical_loss": 3.548429010284753, + "tokens_seen": 1353393152 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002979638916750251, + "loss": 2.9828, + "theoretical_loss": 3.5484135603546743, + "tokens_seen": 1353458688 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002979538615847543, + "loss": 2.892, + "theoretical_loss": 3.548398111382138, + "tokens_seen": 1353524224 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029794383149448346, + "loss": 2.9089, + "theoretical_loss": 3.5483826633670383, + "tokens_seen": 1353589760 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002979338014042127, + "loss": 2.7785, + "theoretical_loss": 3.5483672163092694, + "tokens_seen": 1353655296 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002979237713139418, + "loss": 2.8287, + "theoretical_loss": 3.5483517702087255, + "tokens_seen": 1353720832 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029791374122367106, + "loss": 2.7573, + "theoretical_loss": 3.5483363250653017, + "tokens_seen": 1353786368 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002979037111334002, + "loss": 2.9224, + "theoretical_loss": 3.5483208808788915, + "tokens_seen": 1353851904 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002978936810431294, + "loss": 2.7411, + "theoretical_loss": 3.5483054376493897, + "tokens_seen": 1353917440 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002978836509528586, + "loss": 2.7736, + "theoretical_loss": 3.5482899953766904, + "tokens_seen": 1353982976 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002978736208625878, + "loss": 2.5422, + "theoretical_loss": 3.548274554060688, + "tokens_seen": 1354048512 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029786359077231696, + "loss": 2.7277, + "theoretical_loss": 3.5482591137012776, + "tokens_seen": 1354114048 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029785356068204614, + "loss": 2.9084, + "theoretical_loss": 3.5482436742983534, + "tokens_seen": 1354179584 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002978435305917753, + "loss": 2.686, + "theoretical_loss": 3.548228235851809, + "tokens_seen": 1354245120 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029783350050150456, + "loss": 2.8831, + "theoretical_loss": 3.54821279836154, + "tokens_seen": 1354310656 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002978234704112337, + "loss": 2.7349, + "theoretical_loss": 3.54819736182744, + "tokens_seen": 1354376192 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002978134403209629, + "loss": 2.7584, + "theoretical_loss": 3.548181926249404, + "tokens_seen": 1354441728 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029780341023069205, + "loss": 2.803, + "theoretical_loss": 3.5481664916273266, + "tokens_seen": 1354507264 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002977933801404213, + "loss": 2.7141, + "theoretical_loss": 3.5481510579611024, + "tokens_seen": 1354572800 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029778335005015046, + "loss": 2.7103, + "theoretical_loss": 3.5481356252506258, + "tokens_seen": 1354638336 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029777331995987965, + "loss": 2.8114, + "theoretical_loss": 3.548120193495791, + "tokens_seen": 1354703872 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029776328986960883, + "loss": 2.7209, + "theoretical_loss": 3.548104762696493, + "tokens_seen": 1354769408 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029775325977933806, + "loss": 2.9486, + "theoretical_loss": 3.5480893328526264, + "tokens_seen": 1354834944 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002977432296890672, + "loss": 2.6899, + "theoretical_loss": 3.5480739039640863, + "tokens_seen": 1354900480 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9221246242523193, + "objective/train/theoretical_loss": 3.5480700468912083, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.5480700468912083, + "tokens_seen": 1354916864 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002977331995987964, + "loss": 2.854, + "theoretical_loss": 3.548058476030766, + "tokens_seen": 1354966016 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029772316950852555, + "loss": 2.9151, + "theoretical_loss": 3.548043049052562, + "tokens_seen": 1355031552 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002977131394182548, + "loss": 2.7838, + "theoretical_loss": 3.548027623029367, + "tokens_seen": 1355097088 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029770310932798397, + "loss": 2.7238, + "theoretical_loss": 3.5480121979610777, + "tokens_seen": 1355162624 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029769307923771315, + "loss": 2.9384, + "theoretical_loss": 3.5479967738475873, + "tokens_seen": 1355228160 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029768304914744233, + "loss": 2.8168, + "theoretical_loss": 3.547981350688791, + "tokens_seen": 1355293696 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002976730190571715, + "loss": 2.7952, + "theoretical_loss": 3.547965928484583, + "tokens_seen": 1355359232 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002976629889669007, + "loss": 2.6475, + "theoretical_loss": 3.5479505072348596, + "tokens_seen": 1355424768 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029765295887662993, + "loss": 2.6339, + "theoretical_loss": 3.547935086939514, + "tokens_seen": 1355490304 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029764292878635905, + "loss": 2.6398, + "theoretical_loss": 3.5479196675984417, + "tokens_seen": 1355555840 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002976328986960883, + "loss": 2.7982, + "theoretical_loss": 3.5479042492115376, + "tokens_seen": 1355621376 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002976228686058174, + "loss": 2.86, + "theoretical_loss": 3.5478888317786965, + "tokens_seen": 1355686912 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029761283851554665, + "loss": 2.824, + "theoretical_loss": 3.5478734152998133, + "tokens_seen": 1355752448 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029760280842527583, + "loss": 2.8687, + "theoretical_loss": 3.5478579997747826, + "tokens_seen": 1355817984 + }, + { + "epoch": 3.1, + "learning_rate": 0.000297592778335005, + "loss": 2.8338, + "theoretical_loss": 3.5478425852034987, + "tokens_seen": 1355883520 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002975827482447342, + "loss": 2.9266, + "theoretical_loss": 3.5478271715858574, + "tokens_seen": 1355949056 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029757271815446343, + "loss": 2.83, + "theoretical_loss": 3.547811758921754, + "tokens_seen": 1356014592 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029756268806419256, + "loss": 2.794, + "theoretical_loss": 3.547796347211082, + "tokens_seen": 1356080128 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002975526579739218, + "loss": 2.8902, + "theoretical_loss": 3.547780936453738, + "tokens_seen": 1356145664 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002975426278836509, + "loss": 2.8651, + "theoretical_loss": 3.5477655266496155, + "tokens_seen": 1356211200 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029753259779338016, + "loss": 2.881, + "theoretical_loss": 3.54775011779861, + "tokens_seen": 1356276736 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029752256770310934, + "loss": 3.0063, + "theoretical_loss": 3.547734709900617, + "tokens_seen": 1356342272 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002975125376128385, + "loss": 2.9214, + "theoretical_loss": 3.5477193029555307, + "tokens_seen": 1356407808 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002975025075225677, + "loss": 2.881, + "theoretical_loss": 3.547703896963247, + "tokens_seen": 1356473344 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002974924774322969, + "loss": 2.8873, + "theoretical_loss": 3.5476884919236604, + "tokens_seen": 1356538880 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.828191041946411, + "objective/train/theoretical_loss": 3.5476846408126104, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.5476846408126104, + "tokens_seen": 1356555264 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029748244734202606, + "loss": 2.7115, + "theoretical_loss": 3.547673087836666, + "tokens_seen": 1356604416 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002974724172517553, + "loss": 2.7624, + "theoretical_loss": 3.547657684702159, + "tokens_seen": 1356669952 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002974623871614844, + "loss": 2.7419, + "theoretical_loss": 3.547642282520034, + "tokens_seen": 1356735488 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029745235707121366, + "loss": 2.85, + "theoretical_loss": 3.547626881290187, + "tokens_seen": 1356801024 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002974423269809428, + "loss": 2.9105, + "theoretical_loss": 3.5476114810125123, + "tokens_seen": 1356866560 + }, + { + "epoch": 3.1, + "learning_rate": 0.000297432296890672, + "loss": 2.5832, + "theoretical_loss": 3.547596081686906, + "tokens_seen": 1356932096 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002974222668004012, + "loss": 2.7483, + "theoretical_loss": 3.5475806833132624, + "tokens_seen": 1356997632 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002974122367101304, + "loss": 2.752, + "theoretical_loss": 3.547565285891477, + "tokens_seen": 1357063168 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029740220661985956, + "loss": 2.6895, + "theoretical_loss": 3.547549889421445, + "tokens_seen": 1357128704 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002973921765295888, + "loss": 2.6936, + "theoretical_loss": 3.5475344939030613, + "tokens_seen": 1357194240 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002973821464393179, + "loss": 2.7834, + "theoretical_loss": 3.5475190993362213, + "tokens_seen": 1357259776 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029737211634904716, + "loss": 2.8298, + "theoretical_loss": 3.5475037057208207, + "tokens_seen": 1357325312 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002973620862587763, + "loss": 2.8043, + "theoretical_loss": 3.5474883130567543, + "tokens_seen": 1357390848 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002973520561685055, + "loss": 2.8887, + "theoretical_loss": 3.5474729213439176, + "tokens_seen": 1357456384 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002973420260782347, + "loss": 3.0134, + "theoretical_loss": 3.547457530582206, + "tokens_seen": 1357521920 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002973319959879639, + "loss": 2.7589, + "theoretical_loss": 3.547442140771514, + "tokens_seen": 1357587456 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029732196589769307, + "loss": 2.7464, + "theoretical_loss": 3.547426751911738, + "tokens_seen": 1357652992 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029731193580742225, + "loss": 2.8229, + "theoretical_loss": 3.547411364002773, + "tokens_seen": 1357718528 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029730190571715143, + "loss": 2.7052, + "theoretical_loss": 3.547395977044514, + "tokens_seen": 1357784064 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029729187562688067, + "loss": 2.7067, + "theoretical_loss": 3.5473805910368563, + "tokens_seen": 1357849600 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029728184553660985, + "loss": 2.8352, + "theoretical_loss": 3.5473652059796965, + "tokens_seen": 1357915136 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029727181544633903, + "loss": 2.9103, + "theoretical_loss": 3.5473498218729285, + "tokens_seen": 1357980672 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029726178535606826, + "loss": 2.7046, + "theoretical_loss": 3.5473344387164483, + "tokens_seen": 1358046208 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002972517552657974, + "loss": 2.8274, + "theoretical_loss": 3.5473190565101516, + "tokens_seen": 1358111744 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002972417251755266, + "loss": 2.5921, + "theoretical_loss": 3.5473036752539335, + "tokens_seen": 1358177280 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7137787342071533, + "objective/train/theoretical_loss": 3.547299830088317, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.547299830088317, + "tokens_seen": 1358193664 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029723169508525575, + "loss": 2.8665, + "theoretical_loss": 3.54728829494769, + "tokens_seen": 1358242816 + }, + { + "epoch": 3.1, + "learning_rate": 0.000297221664994985, + "loss": 2.715, + "theoretical_loss": 3.547272915591316, + "tokens_seen": 1358308352 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029721163490471417, + "loss": 2.6964, + "theoretical_loss": 3.5472575371847075, + "tokens_seen": 1358373888 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029720160481444335, + "loss": 2.5988, + "theoretical_loss": 3.5472421597277597, + "tokens_seen": 1358439424 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029719157472417253, + "loss": 2.7594, + "theoretical_loss": 3.547226783220368, + "tokens_seen": 1358504960 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002971815446339017, + "loss": 2.7211, + "theoretical_loss": 3.547211407662428, + "tokens_seen": 1358570496 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002971715145436309, + "loss": 2.8081, + "theoretical_loss": 3.547196033053836, + "tokens_seen": 1358636032 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029716148445336013, + "loss": 2.904, + "theoretical_loss": 3.5471806593944866, + "tokens_seen": 1358701568 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029715145436308926, + "loss": 2.8587, + "theoretical_loss": 3.5471652866842764, + "tokens_seen": 1358767104 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002971414242728185, + "loss": 2.8911, + "theoretical_loss": 3.5471499149231, + "tokens_seen": 1358832640 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002971313941825476, + "loss": 2.7211, + "theoretical_loss": 3.547134544110854, + "tokens_seen": 1358898176 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029712136409227685, + "loss": 2.845, + "theoretical_loss": 3.5471191742474337, + "tokens_seen": 1358963712 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029711133400200603, + "loss": 2.8448, + "theoretical_loss": 3.547103805332734, + "tokens_seen": 1359029248 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002971013039117352, + "loss": 2.7719, + "theoretical_loss": 3.547088437366652, + "tokens_seen": 1359094784 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002970912738214644, + "loss": 2.8694, + "theoretical_loss": 3.5470730703490823, + "tokens_seen": 1359160320 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029708124373119363, + "loss": 2.6143, + "theoretical_loss": 3.547057704279921, + "tokens_seen": 1359225856 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029707121364092276, + "loss": 2.6948, + "theoretical_loss": 3.5470423391590638, + "tokens_seen": 1359291392 + }, + { + "epoch": 3.1, + "learning_rate": 0.000297061183550652, + "loss": 2.7139, + "theoretical_loss": 3.547026974986407, + "tokens_seen": 1359356928 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002970511534603811, + "loss": 2.7995, + "theoretical_loss": 3.547011611761845, + "tokens_seen": 1359422464 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029704112337011036, + "loss": 2.7438, + "theoretical_loss": 3.546996249485275, + "tokens_seen": 1359488000 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029703109327983954, + "loss": 2.823, + "theoretical_loss": 3.5469808881565927, + "tokens_seen": 1359553536 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002970210631895687, + "loss": 2.6982, + "theoretical_loss": 3.546965527775693, + "tokens_seen": 1359619072 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002970110330992979, + "loss": 2.6707, + "theoretical_loss": 3.5469501683424727, + "tokens_seen": 1359684608 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002970010030090271, + "loss": 2.8138, + "theoretical_loss": 3.5469348098568267, + "tokens_seen": 1359750144 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029699097291875626, + "loss": 2.9248, + "theoretical_loss": 3.546919452318652, + "tokens_seen": 1359815680 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.726388454437256, + "objective/train/theoretical_loss": 3.546915613082138, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.546915613082138, + "tokens_seen": 1359832064 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002969809428284855, + "loss": 2.8377, + "theoretical_loss": 3.546904095727843, + "tokens_seen": 1359881216 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002969709127382146, + "loss": 2.7463, + "theoretical_loss": 3.546888740084297, + "tokens_seen": 1359946752 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029696088264794386, + "loss": 2.7366, + "theoretical_loss": 3.5468733853879097, + "tokens_seen": 1360012288 + }, + { + "epoch": 3.1, + "learning_rate": 0.000296950852557673, + "loss": 2.8015, + "theoretical_loss": 3.5468580316385765, + "tokens_seen": 1360077824 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002969408224674022, + "loss": 2.7509, + "theoretical_loss": 3.5468426788361938, + "tokens_seen": 1360143360 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002969307923771314, + "loss": 2.6999, + "theoretical_loss": 3.546827326980657, + "tokens_seen": 1360208896 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002969207622868606, + "loss": 2.8512, + "theoretical_loss": 3.546811976071863, + "tokens_seen": 1360274432 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029691073219658976, + "loss": 2.7095, + "theoretical_loss": 3.546796626109707, + "tokens_seen": 1360339968 + }, + { + "epoch": 3.1, + "learning_rate": 0.000296900702106319, + "loss": 2.9024, + "theoretical_loss": 3.5467812770940847, + "tokens_seen": 1360405504 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002968906720160481, + "loss": 2.7626, + "theoretical_loss": 3.5467659290248936, + "tokens_seen": 1360471040 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029688064192577736, + "loss": 2.9433, + "theoretical_loss": 3.5467505819020286, + "tokens_seen": 1360536576 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002968706118355065, + "loss": 2.8412, + "theoretical_loss": 3.5467352357253863, + "tokens_seen": 1360602112 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002968605817452357, + "loss": 2.8372, + "theoretical_loss": 3.5467198904948622, + "tokens_seen": 1360667648 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002968505516549649, + "loss": 2.6435, + "theoretical_loss": 3.546704546210353, + "tokens_seen": 1360733184 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002968405215646941, + "loss": 2.8434, + "theoretical_loss": 3.5466892028717547, + "tokens_seen": 1360798720 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029683049147442327, + "loss": 2.8414, + "theoretical_loss": 3.546673860478963, + "tokens_seen": 1360864256 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029682046138415245, + "loss": 2.8909, + "theoretical_loss": 3.5466585190318747, + "tokens_seen": 1360929792 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029681043129388163, + "loss": 2.8501, + "theoretical_loss": 3.5466431785303856, + "tokens_seen": 1360995328 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029680040120361087, + "loss": 2.819, + "theoretical_loss": 3.5466278389743917, + "tokens_seen": 1361060864 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029679037111334, + "loss": 2.7727, + "theoretical_loss": 3.5466125003637896, + "tokens_seen": 1361126400 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029678034102306923, + "loss": 2.7717, + "theoretical_loss": 3.5465971626984754, + "tokens_seen": 1361191936 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002967703109327984, + "loss": 2.7172, + "theoretical_loss": 3.546581825978345, + "tokens_seen": 1361257472 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002967602808425276, + "loss": 2.8022, + "theoretical_loss": 3.546566490203295, + "tokens_seen": 1361323008 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029675025075225677, + "loss": 2.7014, + "theoretical_loss": 3.5465511553732223, + "tokens_seen": 1361388544 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029674022066198595, + "loss": 2.7876, + "theoretical_loss": 3.546535821488022, + "tokens_seen": 1361454080 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5275931358337402, + "objective/train/theoretical_loss": 3.5465319881643462, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.5465319881643462, + "tokens_seen": 1361470464 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029673019057171513, + "loss": 2.7198, + "theoretical_loss": 3.546520488547591, + "tokens_seen": 1361519616 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029672016048144437, + "loss": 2.8886, + "theoretical_loss": 3.5465051565518255, + "tokens_seen": 1361585152 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002967101303911735, + "loss": 2.8985, + "theoretical_loss": 3.546489825500622, + "tokens_seen": 1361650688 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029670010030090273, + "loss": 2.8286, + "theoretical_loss": 3.5464744953938765, + "tokens_seen": 1361716224 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029669007021063186, + "loss": 2.7767, + "theoretical_loss": 3.546459166231486, + "tokens_seen": 1361781760 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002966800401203611, + "loss": 2.6763, + "theoretical_loss": 3.546443838013346, + "tokens_seen": 1361847296 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002966700100300903, + "loss": 2.7988, + "theoretical_loss": 3.546428510739353, + "tokens_seen": 1361912832 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029665997993981946, + "loss": 2.722, + "theoretical_loss": 3.5464131844094045, + "tokens_seen": 1361978368 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029664994984954864, + "loss": 2.7535, + "theoretical_loss": 3.546397859023396, + "tokens_seen": 1362043904 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002966399197592778, + "loss": 2.8117, + "theoretical_loss": 3.546382534581224, + "tokens_seen": 1362109440 + }, + { + "epoch": 3.1, + "learning_rate": 0.000296629889669007, + "loss": 2.7353, + "theoretical_loss": 3.546367211082785, + "tokens_seen": 1362174976 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029661985957873623, + "loss": 2.9177, + "theoretical_loss": 3.546351888527976, + "tokens_seen": 1362240512 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029660982948846536, + "loss": 2.8284, + "theoretical_loss": 3.5463365669166933, + "tokens_seen": 1362306048 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002965997993981946, + "loss": 2.7891, + "theoretical_loss": 3.5463212462488327, + "tokens_seen": 1362371584 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002965897693079238, + "loss": 2.7751, + "theoretical_loss": 3.5463059265242913, + "tokens_seen": 1362437120 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029657973921765296, + "loss": 2.7615, + "theoretical_loss": 3.5462906077429657, + "tokens_seen": 1362502656 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029656970912738214, + "loss": 2.7552, + "theoretical_loss": 3.5462752899047523, + "tokens_seen": 1362568192 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002965596790371113, + "loss": 2.8117, + "theoretical_loss": 3.546259973009548, + "tokens_seen": 1362633728 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002965496489468405, + "loss": 2.8863, + "theoretical_loss": 3.5462446570572492, + "tokens_seen": 1362699264 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029653961885656974, + "loss": 2.9492, + "theoretical_loss": 3.546229342047752, + "tokens_seen": 1362764800 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002965295887662989, + "loss": 2.8782, + "theoretical_loss": 3.5462140279809535, + "tokens_seen": 1362830336 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002965195586760281, + "loss": 2.8082, + "theoretical_loss": 3.5461987148567506, + "tokens_seen": 1362895872 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002965095285857573, + "loss": 2.8521, + "theoretical_loss": 3.5461834026750396, + "tokens_seen": 1362961408 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029649949849548646, + "loss": 2.9922, + "theoretical_loss": 3.5461680914357174, + "tokens_seen": 1363026944 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002964894684052157, + "loss": 2.8822, + "theoretical_loss": 3.54615278113868, + "tokens_seen": 1363092480 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.679535150527954, + "objective/train/theoretical_loss": 3.546148953711641, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.546148953711641, + "tokens_seen": 1363108864 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002964794383149448, + "loss": 2.7309, + "theoretical_loss": 3.546137471783825, + "tokens_seen": 1363158016 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029646940822467406, + "loss": 2.8277, + "theoretical_loss": 3.546122163371049, + "tokens_seen": 1363223552 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002964593781344032, + "loss": 2.7614, + "theoretical_loss": 3.5461068559002484, + "tokens_seen": 1363289088 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002964493480441324, + "loss": 2.8329, + "theoretical_loss": 3.5460915493713205, + "tokens_seen": 1363354624 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002964393179538616, + "loss": 2.8641, + "theoretical_loss": 3.546076243784161, + "tokens_seen": 1363420160 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002964292878635908, + "loss": 2.6515, + "theoretical_loss": 3.5460609391386675, + "tokens_seen": 1363485696 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029641925777331996, + "loss": 2.7829, + "theoretical_loss": 3.546045635434737, + "tokens_seen": 1363551232 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002964092276830492, + "loss": 2.744, + "theoretical_loss": 3.5460303326722658, + "tokens_seen": 1363616768 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002963991975927783, + "loss": 2.8433, + "theoretical_loss": 3.5460150308511507, + "tokens_seen": 1363682304 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029638916750250756, + "loss": 2.7769, + "theoretical_loss": 3.545999729971289, + "tokens_seen": 1363747840 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002963791374122367, + "loss": 2.7541, + "theoretical_loss": 3.5459844300325774, + "tokens_seen": 1363813376 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002963691073219659, + "loss": 2.8097, + "theoretical_loss": 3.5459691310349126, + "tokens_seen": 1363878912 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002963590772316951, + "loss": 2.883, + "theoretical_loss": 3.5459538329781912, + "tokens_seen": 1363944448 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002963490471414243, + "loss": 3.0797, + "theoretical_loss": 3.5459385358623114, + "tokens_seen": 1364009984 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029633901705115347, + "loss": 2.8499, + "theoretical_loss": 3.545923239687169, + "tokens_seen": 1364075520 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029632898696088265, + "loss": 2.8977, + "theoretical_loss": 3.5459079444526616, + "tokens_seen": 1364141056 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029631895687061183, + "loss": 2.828, + "theoretical_loss": 3.5458926501586854, + "tokens_seen": 1364206592 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029630892678034107, + "loss": 2.7466, + "theoretical_loss": 3.5458773568051374, + "tokens_seen": 1364272128 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002962988966900702, + "loss": 2.8975, + "theoretical_loss": 3.545862064391916, + "tokens_seen": 1364337664 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029628886659979943, + "loss": 2.8046, + "theoretical_loss": 3.5458467729189165, + "tokens_seen": 1364403200 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002962788365095286, + "loss": 2.8825, + "theoretical_loss": 3.545831482386037, + "tokens_seen": 1364468736 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002962688064192578, + "loss": 2.829, + "theoretical_loss": 3.5458161927931746, + "tokens_seen": 1364534272 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029625877632898697, + "loss": 2.8629, + "theoretical_loss": 3.545800904140225, + "tokens_seen": 1364599808 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029624874623871615, + "loss": 2.7488, + "theoretical_loss": 3.545785616427087, + "tokens_seen": 1364665344 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029623871614844533, + "loss": 2.6993, + "theoretical_loss": 3.545770329653657, + "tokens_seen": 1364730880 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.73286509513855, + "objective/train/theoretical_loss": 3.545766508107117, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.545766508107117, + "tokens_seen": 1364747264 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029622868605817457, + "loss": 2.8552, + "theoretical_loss": 3.545755043819832, + "tokens_seen": 1364796416 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002962186559679037, + "loss": 2.802, + "theoretical_loss": 3.5457397589255093, + "tokens_seen": 1364861952 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029620862587763293, + "loss": 2.8232, + "theoretical_loss": 3.5457244749705854, + "tokens_seen": 1364927488 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029619859578736206, + "loss": 2.8081, + "theoretical_loss": 3.545709191954959, + "tokens_seen": 1364993024 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002961885656970913, + "loss": 2.7382, + "theoretical_loss": 3.5456939098785254, + "tokens_seen": 1365058560 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002961785356068205, + "loss": 2.756, + "theoretical_loss": 3.5456786287411832, + "tokens_seen": 1365124096 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029616850551654966, + "loss": 2.8396, + "theoretical_loss": 3.5456633485428295, + "tokens_seen": 1365189632 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029615847542627884, + "loss": 2.7368, + "theoretical_loss": 3.5456480692833603, + "tokens_seen": 1365255168 + }, + { + "epoch": 3.1, + "learning_rate": 0.000296148445336008, + "loss": 2.9011, + "theoretical_loss": 3.5456327909626744, + "tokens_seen": 1365320704 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002961384152457372, + "loss": 2.7757, + "theoretical_loss": 3.545617513580668, + "tokens_seen": 1365386240 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029612838515546643, + "loss": 2.7821, + "theoretical_loss": 3.5456022371372384, + "tokens_seen": 1365451776 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029611835506519556, + "loss": 2.824, + "theoretical_loss": 3.545586961632284, + "tokens_seen": 1365517312 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002961083249749248, + "loss": 2.8268, + "theoretical_loss": 3.545571687065701, + "tokens_seen": 1365582848 + }, + { + "epoch": 3.1, + "learning_rate": 0.000296098294884654, + "loss": 2.8949, + "theoretical_loss": 3.545556413437387, + "tokens_seen": 1365648384 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029608826479438316, + "loss": 3.0029, + "theoretical_loss": 3.5455411407472397, + "tokens_seen": 1365713920 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029607823470411234, + "loss": 2.8528, + "theoretical_loss": 3.5455258689951563, + "tokens_seen": 1365779456 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002960682046138415, + "loss": 2.7995, + "theoretical_loss": 3.5455105981810338, + "tokens_seen": 1365844992 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002960581745235707, + "loss": 2.8438, + "theoretical_loss": 3.54549532830477, + "tokens_seen": 1365910528 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029604814443329994, + "loss": 2.9235, + "theoretical_loss": 3.5454800593662616, + "tokens_seen": 1365976064 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029603811434302906, + "loss": 2.8794, + "theoretical_loss": 3.545464791365407, + "tokens_seen": 1366041600 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002960280842527583, + "loss": 2.5657, + "theoretical_loss": 3.545449524302103, + "tokens_seen": 1366107136 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002960180541624874, + "loss": 2.7975, + "theoretical_loss": 3.5454342581762477, + "tokens_seen": 1366172672 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029600802407221666, + "loss": 2.7984, + "theoretical_loss": 3.545418992987738, + "tokens_seen": 1366238208 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029599799398194584, + "loss": 2.8619, + "theoretical_loss": 3.5454037287364715, + "tokens_seen": 1366303744 + }, + { + "epoch": 3.1, + "learning_rate": 0.000295987963891675, + "loss": 2.8157, + "theoretical_loss": 3.545388465422345, + "tokens_seen": 1366369280 + }, + { + "epoch": 3.1, + "objective/train/docs_used": 2151444, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8814573287963867, + "objective/train/theoretical_loss": 3.54538464974023, + "objective/train/tokens_used": 1366811104, + "theoretical_loss": 3.54538464974023, + "tokens_seen": 1366385664 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002959779338014042, + "loss": 2.9289, + "theoretical_loss": 3.5453732030452576, + "tokens_seen": 1366434816 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002959679037111334, + "loss": 2.8185, + "theoretical_loss": 3.545357941605106, + "tokens_seen": 1366500352 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029595787362086257, + "loss": 2.7213, + "theoretical_loss": 3.545342681101787, + "tokens_seen": 1366565888 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002959478435305918, + "loss": 2.8212, + "theoretical_loss": 3.5453274215352, + "tokens_seen": 1366631424 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029593781344032093, + "loss": 2.7575, + "theoretical_loss": 3.5453121629052413, + "tokens_seen": 1366696960 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029592778335005016, + "loss": 2.8523, + "theoretical_loss": 3.545296905211808, + "tokens_seen": 1366762496 + }, + { + "epoch": 3.1, + "learning_rate": 0.00029591775325977935, + "loss": 2.9042, + "theoretical_loss": 3.545281648454799, + "tokens_seen": 1366828032 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029590772316950853, + "loss": 3.4163, + "theoretical_loss": 3.5452656775404883, + "tokens_seen": 1366896640 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002958976930792377, + "loss": 2.638, + "theoretical_loss": 3.545250422699902, + "tokens_seen": 1366962176 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002958876629889669, + "loss": 3.015, + "theoretical_loss": 3.5452351687954278, + "tokens_seen": 1367027712 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029587763289869607, + "loss": 2.9003, + "theoretical_loss": 3.545219915826963, + "tokens_seen": 1367093248 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002958676028084253, + "loss": 2.9438, + "theoretical_loss": 3.5452046637944057, + "tokens_seen": 1367158784 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029585757271815443, + "loss": 2.7556, + "theoretical_loss": 3.545189412697653, + "tokens_seen": 1367224320 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029584754262788367, + "loss": 2.8933, + "theoretical_loss": 3.5451741625366036, + "tokens_seen": 1367289856 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002958375125376128, + "loss": 2.8706, + "theoretical_loss": 3.545158913311154, + "tokens_seen": 1367355392 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029582748244734203, + "loss": 2.8081, + "theoretical_loss": 3.545143665021203, + "tokens_seen": 1367420928 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002958174523570712, + "loss": 2.8281, + "theoretical_loss": 3.5451284176666475, + "tokens_seen": 1367486464 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002958074222668004, + "loss": 2.8137, + "theoretical_loss": 3.5451131712473862, + "tokens_seen": 1367552000 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002957973921765296, + "loss": 2.7144, + "theoretical_loss": 3.5450979257633164, + "tokens_seen": 1367617536 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002957873620862588, + "loss": 2.7357, + "theoretical_loss": 3.5450826812143363, + "tokens_seen": 1367683072 + }, + { + "epoch": 4.0, + "learning_rate": 0.000295777331995988, + "loss": 2.7378, + "theoretical_loss": 3.545067437600343, + "tokens_seen": 1367748608 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029576730190571717, + "loss": 2.7425, + "theoretical_loss": 3.545052194921235, + "tokens_seen": 1367814144 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029575727181544635, + "loss": 2.8226, + "theoretical_loss": 3.54503695317691, + "tokens_seen": 1367879680 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029574724172517553, + "loss": 2.7713, + "theoretical_loss": 3.545021712367266, + "tokens_seen": 1367945216 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2186216, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.200186252593994, + "objective/train/theoretical_loss": 3.5450064724922004, + "objective/train/tokens_used": 1388470752, + "theoretical_loss": 3.5450064724922004, + "tokens_seen": 1368010752 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029573721163490477, + "loss": 2.8465, + "theoretical_loss": 3.5450064724922004, + "tokens_seen": 1368010752 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002957271815446339, + "loss": 2.7067, + "theoretical_loss": 3.5449912335516114, + "tokens_seen": 1368076288 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029571715145436313, + "loss": 2.8618, + "theoretical_loss": 3.544975995545397, + "tokens_seen": 1368141824 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029570712136409226, + "loss": 2.7812, + "theoretical_loss": 3.544960758473456, + "tokens_seen": 1368207360 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002956970912738215, + "loss": 2.9685, + "theoretical_loss": 3.544945522335685, + "tokens_seen": 1368272896 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002956870611835507, + "loss": 2.7899, + "theoretical_loss": 3.544930287131983, + "tokens_seen": 1368338432 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029567703109327986, + "loss": 2.9021, + "theoretical_loss": 3.5449150528622475, + "tokens_seen": 1368403968 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029566700100300904, + "loss": 2.7908, + "theoretical_loss": 3.544899819526376, + "tokens_seen": 1368469504 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002956569709127382, + "loss": 2.9422, + "theoretical_loss": 3.5448845871242676, + "tokens_seen": 1368535040 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002956469408224674, + "loss": 2.9105, + "theoretical_loss": 3.5448693556558197, + "tokens_seen": 1368600576 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029563691073219663, + "loss": 2.6355, + "theoretical_loss": 3.5448541251209305, + "tokens_seen": 1368666112 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029562688064192576, + "loss": 2.8317, + "theoretical_loss": 3.544838895519498, + "tokens_seen": 1368731648 + }, + { + "epoch": 4.0, + "learning_rate": 0.000295616850551655, + "loss": 2.9471, + "theoretical_loss": 3.544823666851421, + "tokens_seen": 1368797184 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002956068204613842, + "loss": 2.8695, + "theoretical_loss": 3.544808439116596, + "tokens_seen": 1368862720 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029559679037111336, + "loss": 2.7961, + "theoretical_loss": 3.5447932123149233, + "tokens_seen": 1368928256 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029558676028084254, + "loss": 2.8205, + "theoretical_loss": 3.5447779864462996, + "tokens_seen": 1368993792 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002955767301905717, + "loss": 2.8594, + "theoretical_loss": 3.544762761510623, + "tokens_seen": 1369059328 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002955667001003009, + "loss": 2.8219, + "theoretical_loss": 3.544747537507792, + "tokens_seen": 1369124864 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029555667001003014, + "loss": 2.7788, + "theoretical_loss": 3.544732314437705, + "tokens_seen": 1369190400 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029554663991975926, + "loss": 2.801, + "theoretical_loss": 3.54471709230026, + "tokens_seen": 1369255936 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002955366098294885, + "loss": 2.7672, + "theoretical_loss": 3.5447018710953557, + "tokens_seen": 1369321472 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002955265797392176, + "loss": 2.9065, + "theoretical_loss": 3.5446866508228903, + "tokens_seen": 1369387008 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029551654964894686, + "loss": 2.8583, + "theoretical_loss": 3.5446714314827608, + "tokens_seen": 1369452544 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029550651955867604, + "loss": 2.7987, + "theoretical_loss": 3.5446562130748664, + "tokens_seen": 1369518080 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002954964894684052, + "loss": 2.9971, + "theoretical_loss": 3.5446409955991056, + "tokens_seen": 1369583616 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2189251, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8702282905578613, + "objective/train/theoretical_loss": 3.5446257790553766, + "objective/train/tokens_used": 1390109152, + "theoretical_loss": 3.5446257790553766, + "tokens_seen": 1369649152 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002954864593781344, + "loss": 2.9711, + "theoretical_loss": 3.5446257790553766, + "tokens_seen": 1369649152 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002954764292878636, + "loss": 2.915, + "theoretical_loss": 3.5446105634435776, + "tokens_seen": 1369714688 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029546639919759277, + "loss": 2.74, + "theoretical_loss": 3.5445953487636066, + "tokens_seen": 1369780224 + }, + { + "epoch": 4.0, + "learning_rate": 0.000295456369107322, + "loss": 2.9206, + "theoretical_loss": 3.5445801350153623, + "tokens_seen": 1369845760 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029544633901705113, + "loss": 2.8366, + "theoretical_loss": 3.5445649221987434, + "tokens_seen": 1369911296 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029543630892678037, + "loss": 2.8615, + "theoretical_loss": 3.544549710313648, + "tokens_seen": 1369976832 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029542627883650955, + "loss": 2.7726, + "theoretical_loss": 3.544534499359974, + "tokens_seen": 1370042368 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029541624874623873, + "loss": 2.772, + "theoretical_loss": 3.5445192893376203, + "tokens_seen": 1370107904 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002954062186559679, + "loss": 2.7794, + "theoretical_loss": 3.5445040802464858, + "tokens_seen": 1370173440 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002953961885656971, + "loss": 2.7957, + "theoretical_loss": 3.544488872086468, + "tokens_seen": 1370238976 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029538615847542627, + "loss": 2.8102, + "theoretical_loss": 3.544473664857466, + "tokens_seen": 1370304512 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002953761283851555, + "loss": 2.756, + "theoretical_loss": 3.5444584585593777, + "tokens_seen": 1370370048 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029536609829488463, + "loss": 2.9152, + "theoretical_loss": 3.544443253192102, + "tokens_seen": 1370435584 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029535606820461387, + "loss": 2.698, + "theoretical_loss": 3.544428048755538, + "tokens_seen": 1370501120 + }, + { + "epoch": 4.0, + "learning_rate": 0.000295346038114343, + "loss": 2.9318, + "theoretical_loss": 3.5444128452495836, + "tokens_seen": 1370566656 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029533600802407223, + "loss": 2.8973, + "theoretical_loss": 3.544397642674137, + "tokens_seen": 1370632192 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002953259779338014, + "loss": 2.7587, + "theoretical_loss": 3.5443824410290974, + "tokens_seen": 1370697728 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002953159478435306, + "loss": 2.8129, + "theoretical_loss": 3.544367240314363, + "tokens_seen": 1370763264 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002953059177532598, + "loss": 2.9239, + "theoretical_loss": 3.5443520405298323, + "tokens_seen": 1370828800 + }, + { + "epoch": 4.0, + "learning_rate": 0.000295295887662989, + "loss": 2.91, + "theoretical_loss": 3.5443368416754044, + "tokens_seen": 1370894336 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029528585757271814, + "loss": 2.7158, + "theoretical_loss": 3.5443216437509775, + "tokens_seen": 1370959872 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029527582748244737, + "loss": 2.8423, + "theoretical_loss": 3.5443064467564502, + "tokens_seen": 1371025408 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002952657973921765, + "loss": 2.8879, + "theoretical_loss": 3.544291250691722, + "tokens_seen": 1371090944 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029525576730190573, + "loss": 2.9974, + "theoretical_loss": 3.544276055556691, + "tokens_seen": 1371156480 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002952457372116349, + "loss": 2.8609, + "theoretical_loss": 3.5442608613512547, + "tokens_seen": 1371222016 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2192065, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2302443981170654, + "objective/train/theoretical_loss": 3.5442456680753134, + "objective/train/tokens_used": 1391747552, + "theoretical_loss": 3.5442456680753134, + "tokens_seen": 1371287552 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002952357071213641, + "loss": 3.0136, + "theoretical_loss": 3.5442456680753134, + "tokens_seen": 1371287552 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002952256770310933, + "loss": 2.8214, + "theoretical_loss": 3.544230475728766, + "tokens_seen": 1371353088 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029521564694082246, + "loss": 2.9582, + "theoretical_loss": 3.54421528431151, + "tokens_seen": 1371418624 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029520561685055164, + "loss": 2.7328, + "theoretical_loss": 3.544200093823445, + "tokens_seen": 1371484160 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002951955867602809, + "loss": 2.9478, + "theoretical_loss": 3.544184904264469, + "tokens_seen": 1371549696 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029518555667001, + "loss": 2.7967, + "theoretical_loss": 3.5441697156344816, + "tokens_seen": 1371615232 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029517552657973924, + "loss": 2.7777, + "theoretical_loss": 3.5441545279333813, + "tokens_seen": 1371680768 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029516549648946836, + "loss": 3.0316, + "theoretical_loss": 3.544139341161067, + "tokens_seen": 1371746304 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002951554663991976, + "loss": 2.8281, + "theoretical_loss": 3.544124155317437, + "tokens_seen": 1371811840 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002951454363089268, + "loss": 2.9216, + "theoretical_loss": 3.544108970402391, + "tokens_seen": 1371877376 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029513540621865596, + "loss": 2.8902, + "theoretical_loss": 3.5440937864158277, + "tokens_seen": 1371942912 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029512537612838514, + "loss": 2.7692, + "theoretical_loss": 3.5440786033576455, + "tokens_seen": 1372008448 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002951153460381144, + "loss": 2.6688, + "theoretical_loss": 3.544063421227743, + "tokens_seen": 1372073984 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002951053159478435, + "loss": 2.8875, + "theoretical_loss": 3.54404824002602, + "tokens_seen": 1372139520 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029509528585757274, + "loss": 2.7835, + "theoretical_loss": 3.5440330597523753, + "tokens_seen": 1372205056 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029508525576730187, + "loss": 2.7829, + "theoretical_loss": 3.5440178804067073, + "tokens_seen": 1372270592 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002950752256770311, + "loss": 2.9332, + "theoretical_loss": 3.5440027019889153, + "tokens_seen": 1372336128 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002950651955867603, + "loss": 2.8124, + "theoretical_loss": 3.543987524498898, + "tokens_seen": 1372401664 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029505516549648946, + "loss": 2.7698, + "theoretical_loss": 3.5439723479365552, + "tokens_seen": 1372467200 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029504513540621865, + "loss": 2.8276, + "theoretical_loss": 3.543957172301785, + "tokens_seen": 1372532736 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002950351053159478, + "loss": 2.8185, + "theoretical_loss": 3.5439419975944864, + "tokens_seen": 1372598272 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029502507522567706, + "loss": 2.7935, + "theoretical_loss": 3.5439268238145596, + "tokens_seen": 1372663808 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029501504513540624, + "loss": 2.9048, + "theoretical_loss": 3.5439116509619017, + "tokens_seen": 1372729344 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002950050150451354, + "loss": 2.7506, + "theoretical_loss": 3.543896479036414, + "tokens_seen": 1372794880 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002949949849548646, + "loss": 3.007, + "theoretical_loss": 3.543881308037994, + "tokens_seen": 1372860416 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2194290, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.99957275390625, + "objective/train/theoretical_loss": 3.5438661379665413, + "objective/train/tokens_used": 1393385952, + "theoretical_loss": 3.5438661379665413, + "tokens_seen": 1372925952 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002949849548645938, + "loss": 2.9309, + "theoretical_loss": 3.5438661379665413, + "tokens_seen": 1372925952 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029497492477432297, + "loss": 2.8793, + "theoretical_loss": 3.543850968821955, + "tokens_seen": 1372991488 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002949648946840522, + "loss": 2.8315, + "theoretical_loss": 3.5438358006041346, + "tokens_seen": 1373057024 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029495486459378133, + "loss": 2.7857, + "theoretical_loss": 3.5438206333129783, + "tokens_seen": 1373122560 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029494483450351057, + "loss": 2.8227, + "theoretical_loss": 3.5438054669483865, + "tokens_seen": 1373188096 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029493480441323975, + "loss": 2.7414, + "theoretical_loss": 3.543790301510257, + "tokens_seen": 1373253632 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029492477432296893, + "loss": 2.7569, + "theoretical_loss": 3.5437751369984904, + "tokens_seen": 1373319168 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002949147442326981, + "loss": 2.865, + "theoretical_loss": 3.543759973412985, + "tokens_seen": 1373384704 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002949047141424273, + "loss": 2.9163, + "theoretical_loss": 3.5437448107536405, + "tokens_seen": 1373450240 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029489468405215647, + "loss": 2.9666, + "theoretical_loss": 3.5437296490203556, + "tokens_seen": 1373515776 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002948846539618857, + "loss": 2.8667, + "theoretical_loss": 3.5437144882130296, + "tokens_seen": 1373581312 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029487462387161483, + "loss": 2.8078, + "theoretical_loss": 3.5436993283315625, + "tokens_seen": 1373646848 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029486459378134407, + "loss": 2.8702, + "theoretical_loss": 3.543684169375853, + "tokens_seen": 1373712384 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002948545636910732, + "loss": 2.8007, + "theoretical_loss": 3.5436690113458007, + "tokens_seen": 1373777920 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029484453360080243, + "loss": 2.7263, + "theoretical_loss": 3.5436538542413047, + "tokens_seen": 1373843456 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002948345035105316, + "loss": 2.9302, + "theoretical_loss": 3.5436386980622645, + "tokens_seen": 1373908992 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002948244734202608, + "loss": 2.8234, + "theoretical_loss": 3.543623542808579, + "tokens_seen": 1373974528 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029481444332999, + "loss": 2.9589, + "theoretical_loss": 3.5436083884801484, + "tokens_seen": 1374040064 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002948044132397192, + "loss": 2.9071, + "theoretical_loss": 3.5435932350768713, + "tokens_seen": 1374105600 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029479438314944834, + "loss": 2.8604, + "theoretical_loss": 3.543578082598647, + "tokens_seen": 1374171136 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029478435305917757, + "loss": 2.7213, + "theoretical_loss": 3.543562931045376, + "tokens_seen": 1374236672 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002947743229689067, + "loss": 2.935, + "theoretical_loss": 3.543547780416957, + "tokens_seen": 1374302208 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029476429287863593, + "loss": 2.7984, + "theoretical_loss": 3.5435326307132895, + "tokens_seen": 1374367744 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002947542627883651, + "loss": 2.7962, + "theoretical_loss": 3.5435174819342725, + "tokens_seen": 1374433280 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002947442326980943, + "loss": 2.9648, + "theoretical_loss": 3.5435023340798066, + "tokens_seen": 1374498816 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2197147, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.887049913406372, + "objective/train/theoretical_loss": 3.54348718714979, + "objective/train/tokens_used": 1395024352, + "theoretical_loss": 3.54348718714979, + "tokens_seen": 1374564352 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002947342026078235, + "loss": 2.8272, + "theoretical_loss": 3.54348718714979, + "tokens_seen": 1374564352 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029472417251755266, + "loss": 2.8351, + "theoretical_loss": 3.543472041144123, + "tokens_seen": 1374629888 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029471414242728184, + "loss": 2.8871, + "theoretical_loss": 3.5434568960627058, + "tokens_seen": 1374695424 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002947041123370111, + "loss": 3.0056, + "theoretical_loss": 3.543441751905436, + "tokens_seen": 1374760960 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002946940822467402, + "loss": 2.738, + "theoretical_loss": 3.543426608672215, + "tokens_seen": 1374826496 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029468405215646944, + "loss": 2.6317, + "theoretical_loss": 3.543411466362942, + "tokens_seen": 1374892032 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029467402206619856, + "loss": 2.6431, + "theoretical_loss": 3.5433963249775156, + "tokens_seen": 1374957568 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002946639919759278, + "loss": 2.9032, + "theoretical_loss": 3.5433811845158356, + "tokens_seen": 1375023104 + }, + { + "epoch": 4.0, + "learning_rate": 0.000294653961885657, + "loss": 2.7551, + "theoretical_loss": 3.5433660449778026, + "tokens_seen": 1375088640 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029464393179538616, + "loss": 2.7051, + "theoretical_loss": 3.543350906363316, + "tokens_seen": 1375154176 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029463390170511534, + "loss": 2.7103, + "theoretical_loss": 3.543335768672275, + "tokens_seen": 1375219712 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002946238716148446, + "loss": 2.8609, + "theoretical_loss": 3.5433206319045794, + "tokens_seen": 1375285248 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002946138415245737, + "loss": 2.9021, + "theoretical_loss": 3.5433054960601287, + "tokens_seen": 1375350784 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029460381143430294, + "loss": 2.9495, + "theoretical_loss": 3.5432903611388227, + "tokens_seen": 1375416320 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029459378134403207, + "loss": 2.8771, + "theoretical_loss": 3.543275227140562, + "tokens_seen": 1375481856 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002945837512537613, + "loss": 2.8032, + "theoretical_loss": 3.5432600940652446, + "tokens_seen": 1375547392 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002945737211634905, + "loss": 2.9729, + "theoretical_loss": 3.5432449619127717, + "tokens_seen": 1375612928 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029456369107321966, + "loss": 2.7201, + "theoretical_loss": 3.5432298306830425, + "tokens_seen": 1375678464 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029455366098294885, + "loss": 2.8778, + "theoretical_loss": 3.543214700375957, + "tokens_seen": 1375744000 + }, + { + "epoch": 4.0, + "learning_rate": 0.000294543630892678, + "loss": 2.6035, + "theoretical_loss": 3.5431995709914146, + "tokens_seen": 1375809536 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002945336008024072, + "loss": 2.7622, + "theoretical_loss": 3.5431844425293155, + "tokens_seen": 1375875072 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029452357071213644, + "loss": 2.9587, + "theoretical_loss": 3.543169314989559, + "tokens_seen": 1375940608 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029451354062186557, + "loss": 2.9372, + "theoretical_loss": 3.543154188372046, + "tokens_seen": 1376006144 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002945035105315948, + "loss": 2.7241, + "theoretical_loss": 3.543139062676675, + "tokens_seen": 1376071680 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029449348044132393, + "loss": 2.851, + "theoretical_loss": 3.5431239379033466, + "tokens_seen": 1376137216 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2200122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.675403356552124, + "objective/train/theoretical_loss": 3.543108814051961, + "objective/train/tokens_used": 1396662752, + "theoretical_loss": 3.543108814051961, + "tokens_seen": 1376202752 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029448345035105317, + "loss": 2.8537, + "theoretical_loss": 3.543108814051961, + "tokens_seen": 1376202752 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029447342026078235, + "loss": 2.8773, + "theoretical_loss": 3.543093691122418, + "tokens_seen": 1376268288 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029446339017051153, + "loss": 2.8096, + "theoretical_loss": 3.5430785691146163, + "tokens_seen": 1376333824 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002944533600802407, + "loss": 2.802, + "theoretical_loss": 3.543063448028457, + "tokens_seen": 1376399360 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029444332998996995, + "loss": 2.8488, + "theoretical_loss": 3.5430483278638403, + "tokens_seen": 1376464896 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002944332998996991, + "loss": 2.8571, + "theoretical_loss": 3.5430332086206655, + "tokens_seen": 1376530432 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002944232698094283, + "loss": 2.7365, + "theoretical_loss": 3.543018090298833, + "tokens_seen": 1376595968 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029441323971915744, + "loss": 2.8363, + "theoretical_loss": 3.543002972898242, + "tokens_seen": 1376661504 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029440320962888667, + "loss": 2.9183, + "theoretical_loss": 3.5429878564187933, + "tokens_seen": 1376727040 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029439317953861585, + "loss": 2.8663, + "theoretical_loss": 3.542972740860387, + "tokens_seen": 1376792576 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029438314944834503, + "loss": 2.7508, + "theoretical_loss": 3.5429576262229228, + "tokens_seen": 1376858112 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002943731193580742, + "loss": 2.8202, + "theoretical_loss": 3.5429425125063005, + "tokens_seen": 1376923648 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002943630892678034, + "loss": 2.906, + "theoretical_loss": 3.5429273997104205, + "tokens_seen": 1376989184 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002943530591775326, + "loss": 2.8288, + "theoretical_loss": 3.542912287835183, + "tokens_seen": 1377054720 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002943430290872618, + "loss": 2.6367, + "theoretical_loss": 3.542897176880488, + "tokens_seen": 1377120256 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029433299899699094, + "loss": 2.9079, + "theoretical_loss": 3.5428820668462357, + "tokens_seen": 1377185792 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002943229689067202, + "loss": 2.7805, + "theoretical_loss": 3.542866957732326, + "tokens_seen": 1377251328 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002943129388164493, + "loss": 2.7937, + "theoretical_loss": 3.542851849538659, + "tokens_seen": 1377316864 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029430290872617854, + "loss": 2.7523, + "theoretical_loss": 3.5428367422651355, + "tokens_seen": 1377382400 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029429287863590777, + "loss": 2.8594, + "theoretical_loss": 3.542821635911655, + "tokens_seen": 1377447936 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002942828485456369, + "loss": 3.0086, + "theoretical_loss": 3.542806530478118, + "tokens_seen": 1377513472 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029427281845536613, + "loss": 2.8201, + "theoretical_loss": 3.542791425964424, + "tokens_seen": 1377579008 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002942627883650953, + "loss": 2.8495, + "theoretical_loss": 3.542776322370475, + "tokens_seen": 1377644544 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002942527582748245, + "loss": 2.7752, + "theoretical_loss": 3.5427612196961693, + "tokens_seen": 1377710080 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002942427281845537, + "loss": 2.789, + "theoretical_loss": 3.5427461179414084, + "tokens_seen": 1377775616 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2203056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.713243246078491, + "objective/train/theoretical_loss": 3.5427310171060924, + "objective/train/tokens_used": 1398301152, + "theoretical_loss": 3.5427310171060924, + "tokens_seen": 1377841152 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029423269809428286, + "loss": 2.8925, + "theoretical_loss": 3.5427310171060924, + "tokens_seen": 1377841152 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029422266800401204, + "loss": 2.7366, + "theoretical_loss": 3.5427159171901206, + "tokens_seen": 1377906688 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002942126379137413, + "loss": 2.8579, + "theoretical_loss": 3.5427008181933948, + "tokens_seen": 1377972224 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002942026078234704, + "loss": 2.8448, + "theoretical_loss": 3.542685720115814, + "tokens_seen": 1378037760 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029419257773319964, + "loss": 2.881, + "theoretical_loss": 3.5426706229572797, + "tokens_seen": 1378103296 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029418254764292876, + "loss": 3.0273, + "theoretical_loss": 3.5426555267176916, + "tokens_seen": 1378168832 + }, + { + "epoch": 4.0, + "learning_rate": 0.000294172517552658, + "loss": 2.8155, + "theoretical_loss": 3.5426404313969497, + "tokens_seen": 1378234368 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002941624874623872, + "loss": 2.8786, + "theoretical_loss": 3.542625336994955, + "tokens_seen": 1378299904 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029415245737211636, + "loss": 2.9619, + "theoretical_loss": 3.542610243511608, + "tokens_seen": 1378365440 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029414242728184554, + "loss": 2.9568, + "theoretical_loss": 3.5425951509468088, + "tokens_seen": 1378430976 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002941323971915748, + "loss": 2.7566, + "theoretical_loss": 3.542580059300458, + "tokens_seen": 1378496512 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002941223671013039, + "loss": 2.7873, + "theoretical_loss": 3.5425649685724556, + "tokens_seen": 1378562048 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029411233701103314, + "loss": 2.7661, + "theoretical_loss": 3.5425498787627023, + "tokens_seen": 1378627584 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029410230692076227, + "loss": 2.8506, + "theoretical_loss": 3.542534789871099, + "tokens_seen": 1378693120 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002940922768304915, + "loss": 2.881, + "theoretical_loss": 3.542519701897546, + "tokens_seen": 1378758656 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002940822467402207, + "loss": 2.7747, + "theoretical_loss": 3.5425046148419432, + "tokens_seen": 1378824192 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029407221664994986, + "loss": 2.86, + "theoretical_loss": 3.542489528704192, + "tokens_seen": 1378889728 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029406218655967905, + "loss": 2.6413, + "theoretical_loss": 3.5424744434841924, + "tokens_seen": 1378955264 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002940521564694082, + "loss": 2.6286, + "theoretical_loss": 3.542459359181845, + "tokens_seen": 1379020800 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002940421263791374, + "loss": 2.9051, + "theoretical_loss": 3.5424442757970507, + "tokens_seen": 1379086336 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029403209628886664, + "loss": 2.7481, + "theoretical_loss": 3.54242919332971, + "tokens_seen": 1379151872 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029402206619859577, + "loss": 2.7181, + "theoretical_loss": 3.542414111779723, + "tokens_seen": 1379217408 + }, + { + "epoch": 4.0, + "learning_rate": 0.000294012036108325, + "loss": 2.7493, + "theoretical_loss": 3.542399031146991, + "tokens_seen": 1379282944 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029400200601805413, + "loss": 2.7605, + "theoretical_loss": 3.542383951431414, + "tokens_seen": 1379348480 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029399197592778337, + "loss": 2.7902, + "theoretical_loss": 3.5423688726328932, + "tokens_seen": 1379414016 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2205936, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7096846103668213, + "objective/train/theoretical_loss": 3.542353794751329, + "objective/train/tokens_used": 1399939552, + "theoretical_loss": 3.542353794751329, + "tokens_seen": 1379479552 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029398194583751255, + "loss": 2.957, + "theoretical_loss": 3.542353794751329, + "tokens_seen": 1379479552 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029397191574724173, + "loss": 2.9699, + "theoretical_loss": 3.542338717786622, + "tokens_seen": 1379545088 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002939618856569709, + "loss": 2.8626, + "theoretical_loss": 3.5423236417386734, + "tokens_seen": 1379610624 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029395185556670015, + "loss": 2.6946, + "theoretical_loss": 3.5423085666073835, + "tokens_seen": 1379676160 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002939418254764293, + "loss": 2.9152, + "theoretical_loss": 3.542293492392653, + "tokens_seen": 1379741696 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002939317953861585, + "loss": 2.904, + "theoretical_loss": 3.5422784190943823, + "tokens_seen": 1379807232 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029392176529588764, + "loss": 2.8256, + "theoretical_loss": 3.5422633467124727, + "tokens_seen": 1379872768 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029391173520561687, + "loss": 2.9613, + "theoretical_loss": 3.542248275246825, + "tokens_seen": 1379938304 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029390170511534605, + "loss": 2.6548, + "theoretical_loss": 3.5422332046973395, + "tokens_seen": 1380003840 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029389167502507523, + "loss": 2.7603, + "theoretical_loss": 3.5422181350639175, + "tokens_seen": 1380069376 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002938816449348044, + "loss": 2.9326, + "theoretical_loss": 3.5422030663464597, + "tokens_seen": 1380134912 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002938716148445336, + "loss": 2.9293, + "theoretical_loss": 3.542187998544867, + "tokens_seen": 1380200448 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002938615847542628, + "loss": 2.8895, + "theoretical_loss": 3.54217293165904, + "tokens_seen": 1380265984 + }, + { + "epoch": 4.0, + "learning_rate": 0.000293851554663992, + "loss": 2.7993, + "theoretical_loss": 3.5421578656888797, + "tokens_seen": 1380331520 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029384152457372114, + "loss": 2.9319, + "theoretical_loss": 3.542142800634287, + "tokens_seen": 1380397056 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002938314944834504, + "loss": 2.8954, + "theoretical_loss": 3.5421277364951624, + "tokens_seen": 1380462592 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002938214643931795, + "loss": 2.9438, + "theoretical_loss": 3.542112673271408, + "tokens_seen": 1380528128 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029381143430290874, + "loss": 2.9001, + "theoretical_loss": 3.542097610962923, + "tokens_seen": 1380593664 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002938014042126379, + "loss": 2.7996, + "theoretical_loss": 3.54208254956961, + "tokens_seen": 1380659200 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002937913741223671, + "loss": 2.9091, + "theoretical_loss": 3.5420674890913686, + "tokens_seen": 1380724736 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002937813440320963, + "loss": 2.7606, + "theoretical_loss": 3.5420524295281006, + "tokens_seen": 1380790272 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002937713139418255, + "loss": 2.8346, + "theoretical_loss": 3.542037370879707, + "tokens_seen": 1380855808 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029376128385155464, + "loss": 2.7432, + "theoretical_loss": 3.542022313146088, + "tokens_seen": 1380921344 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002937512537612839, + "loss": 2.7843, + "theoretical_loss": 3.542007256327146, + "tokens_seen": 1380986880 + }, + { + "epoch": 4.0, + "learning_rate": 0.000293741223671013, + "loss": 2.9943, + "theoretical_loss": 3.54199220042278, + "tokens_seen": 1381052416 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2207757, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.983628511428833, + "objective/train/theoretical_loss": 3.5419771454328934, + "objective/train/tokens_used": 1401577952, + "theoretical_loss": 3.5419771454328934, + "tokens_seen": 1381117952 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029373119358074224, + "loss": 2.9149, + "theoretical_loss": 3.5419771454328934, + "tokens_seen": 1381117952 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002937211634904714, + "loss": 2.7664, + "theoretical_loss": 3.5419620913573864, + "tokens_seen": 1381183488 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002937111334002006, + "loss": 2.7565, + "theoretical_loss": 3.541947038196159, + "tokens_seen": 1381249024 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002937011033099298, + "loss": 2.888, + "theoretical_loss": 3.5419319859491134, + "tokens_seen": 1381314560 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029369107321965896, + "loss": 2.7794, + "theoretical_loss": 3.5419169346161503, + "tokens_seen": 1381380096 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029368104312938815, + "loss": 2.8869, + "theoretical_loss": 3.5419018841971712, + "tokens_seen": 1381445632 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002936710130391174, + "loss": 2.8033, + "theoretical_loss": 3.541886834692077, + "tokens_seen": 1381511168 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002936609829488465, + "loss": 2.7142, + "theoretical_loss": 3.541871786100769, + "tokens_seen": 1381576704 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029365095285857574, + "loss": 2.7069, + "theoretical_loss": 3.5418567384231476, + "tokens_seen": 1381642240 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029364092276830487, + "loss": 2.6818, + "theoretical_loss": 3.541841691659115, + "tokens_seen": 1381707776 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002936308926780341, + "loss": 2.6897, + "theoretical_loss": 3.5418266458085723, + "tokens_seen": 1381773312 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002936208625877633, + "loss": 2.805, + "theoretical_loss": 3.54181160087142, + "tokens_seen": 1381838848 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029361083249749247, + "loss": 2.9161, + "theoretical_loss": 3.5417965568475607, + "tokens_seen": 1381904384 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029360080240722165, + "loss": 2.9071, + "theoretical_loss": 3.5417815137368938, + "tokens_seen": 1381969920 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002935907723169509, + "loss": 2.7328, + "theoretical_loss": 3.5417664715393222, + "tokens_seen": 1382035456 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029358074222668, + "loss": 2.7892, + "theoretical_loss": 3.5417514302547457, + "tokens_seen": 1382100992 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029357071213640925, + "loss": 2.8552, + "theoretical_loss": 3.541736389883067, + "tokens_seen": 1382166528 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002935606820461384, + "loss": 2.5836, + "theoretical_loss": 3.5417213504241865, + "tokens_seen": 1382232064 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002935506519558676, + "loss": 2.9871, + "theoretical_loss": 3.5417063118780057, + "tokens_seen": 1382297600 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029354062186559684, + "loss": 2.828, + "theoretical_loss": 3.5416912742444264, + "tokens_seen": 1382363136 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029353059177532597, + "loss": 2.5844, + "theoretical_loss": 3.5416762375233497, + "tokens_seen": 1382428672 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002935205616850552, + "loss": 2.8448, + "theoretical_loss": 3.5416612017146765, + "tokens_seen": 1382494208 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029351053159478433, + "loss": 2.8521, + "theoretical_loss": 3.541646166818309, + "tokens_seen": 1382559744 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029350050150451357, + "loss": 2.7912, + "theoretical_loss": 3.541631132834148, + "tokens_seen": 1382625280 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029349047141424275, + "loss": 2.7844, + "theoretical_loss": 3.5416160997620953, + "tokens_seen": 1382690816 + }, + { + "epoch": 4.0, + "objective/train/docs_used": 2210638, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.73905873298645, + "objective/train/theoretical_loss": 3.541601067602052, + "objective/train/tokens_used": 1403216352, + "theoretical_loss": 3.541601067602052, + "tokens_seen": 1382756352 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029348044132397193, + "loss": 2.7986, + "theoretical_loss": 3.541601067602052, + "tokens_seen": 1382756352 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002934704112337011, + "loss": 2.8013, + "theoretical_loss": 3.5415860363539196, + "tokens_seen": 1382821888 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029346038114343035, + "loss": 2.8228, + "theoretical_loss": 3.5415710060176, + "tokens_seen": 1382887424 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002934503510531595, + "loss": 2.8105, + "theoretical_loss": 3.541555976592994, + "tokens_seen": 1382952960 + }, + { + "epoch": 4.0, + "learning_rate": 0.0002934403209628887, + "loss": 2.6833, + "theoretical_loss": 3.5415409480800033, + "tokens_seen": 1383018496 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029343029087261784, + "loss": 2.9188, + "theoretical_loss": 3.54152592047853, + "tokens_seen": 1383084032 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029342026078234707, + "loss": 2.7703, + "theoretical_loss": 3.541510893788475, + "tokens_seen": 1383149568 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029341023069207625, + "loss": 2.8382, + "theoretical_loss": 3.54149586800974, + "tokens_seen": 1383215104 + }, + { + "epoch": 4.0, + "learning_rate": 0.00029340020060180543, + "loss": 2.8014, + "theoretical_loss": 3.541480843142227, + "tokens_seen": 1383280640 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002933901705115346, + "loss": 2.7927, + "theoretical_loss": 3.5414658191858366, + "tokens_seen": 1383346176 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002933801404212638, + "loss": 2.9512, + "theoretical_loss": 3.541450796140471, + "tokens_seen": 1383411712 + }, + { + "epoch": 4.01, + "learning_rate": 0.000293370110330993, + "loss": 2.8056, + "theoretical_loss": 3.541435774006032, + "tokens_seen": 1383477248 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002933600802407222, + "loss": 2.7987, + "theoretical_loss": 3.541420752782421, + "tokens_seen": 1383542784 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029335005015045134, + "loss": 2.8237, + "theoretical_loss": 3.5414057324695394, + "tokens_seen": 1383608320 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002933400200601806, + "loss": 2.9686, + "theoretical_loss": 3.5413907130672886, + "tokens_seen": 1383673856 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002933299899699097, + "loss": 2.9078, + "theoretical_loss": 3.5413756945755717, + "tokens_seen": 1383739392 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029331995987963894, + "loss": 2.8337, + "theoretical_loss": 3.5413606769942887, + "tokens_seen": 1383804928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002933099297893681, + "loss": 2.8114, + "theoretical_loss": 3.5413456603233424, + "tokens_seen": 1383870464 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002932998996990973, + "loss": 2.7293, + "theoretical_loss": 3.541330644562634, + "tokens_seen": 1383936000 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002932898696088265, + "loss": 2.8563, + "theoretical_loss": 3.5413156297120647, + "tokens_seen": 1384001536 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002932798395185557, + "loss": 2.8818, + "theoretical_loss": 3.5413006157715374, + "tokens_seen": 1384067072 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029326980942828484, + "loss": 2.833, + "theoretical_loss": 3.5412856027409534, + "tokens_seen": 1384132608 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002932597793380141, + "loss": 2.7356, + "theoretical_loss": 3.5412705906202144, + "tokens_seen": 1384198144 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002932497492477432, + "loss": 2.7262, + "theoretical_loss": 3.5412555794092224, + "tokens_seen": 1384263680 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029323971915747244, + "loss": 2.8682, + "theoretical_loss": 3.5412405691078783, + "tokens_seen": 1384329216 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2213544, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.160142660140991, + "objective/train/theoretical_loss": 3.541225559716085, + "objective/train/tokens_used": 1404854752, + "theoretical_loss": 3.541225559716085, + "tokens_seen": 1384394752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002932296890672016, + "loss": 2.9203, + "theoretical_loss": 3.541225559716085, + "tokens_seen": 1384394752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002932196589769308, + "loss": 2.8991, + "theoretical_loss": 3.541210551233744, + "tokens_seen": 1384460288 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029320962888666, + "loss": 2.7162, + "theoretical_loss": 3.541195543660757, + "tokens_seen": 1384525824 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029319959879638916, + "loss": 2.8919, + "theoretical_loss": 3.541180536997026, + "tokens_seen": 1384591360 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029318956870611835, + "loss": 2.794, + "theoretical_loss": 3.5411655312424526, + "tokens_seen": 1384656896 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002931795386158476, + "loss": 3.011, + "theoretical_loss": 3.5411505263969394, + "tokens_seen": 1384722432 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002931695085255767, + "loss": 2.8071, + "theoretical_loss": 3.541135522460387, + "tokens_seen": 1384787968 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029315947843530594, + "loss": 2.787, + "theoretical_loss": 3.541120519432699, + "tokens_seen": 1384853504 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029314944834503507, + "loss": 2.9463, + "theoretical_loss": 3.5411055173137758, + "tokens_seen": 1384919040 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002931394182547643, + "loss": 2.8908, + "theoretical_loss": 3.5410905161035204, + "tokens_seen": 1384984576 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002931293881644935, + "loss": 2.7829, + "theoretical_loss": 3.5410755158018343, + "tokens_seen": 1385050112 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029311935807422267, + "loss": 2.9147, + "theoretical_loss": 3.5410605164086197, + "tokens_seen": 1385115648 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029310932798395185, + "loss": 2.8394, + "theoretical_loss": 3.541045517923778, + "tokens_seen": 1385181184 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002930992978936811, + "loss": 2.6496, + "theoretical_loss": 3.5410305203472126, + "tokens_seen": 1385246720 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002930892678034102, + "loss": 2.8045, + "theoretical_loss": 3.5410155236788237, + "tokens_seen": 1385312256 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029307923771313945, + "loss": 2.9206, + "theoretical_loss": 3.541000527918515, + "tokens_seen": 1385377792 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002930692076228686, + "loss": 2.8353, + "theoretical_loss": 3.5409855330661877, + "tokens_seen": 1385443328 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002930591775325978, + "loss": 2.7931, + "theoretical_loss": 3.540970539121744, + "tokens_seen": 1385508864 + }, + { + "epoch": 4.01, + "learning_rate": 0.000293049147442327, + "loss": 3.0088, + "theoretical_loss": 3.540955546085086, + "tokens_seen": 1385574400 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029303911735205617, + "loss": 2.867, + "theoretical_loss": 3.5409405539561156, + "tokens_seen": 1385639936 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029302908726178535, + "loss": 2.9374, + "theoretical_loss": 3.540925562734735, + "tokens_seen": 1385705472 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029301905717151453, + "loss": 2.8686, + "theoretical_loss": 3.540910572420847, + "tokens_seen": 1385771008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002930090270812437, + "loss": 2.8906, + "theoretical_loss": 3.540895583014353, + "tokens_seen": 1385836544 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029299899699097295, + "loss": 2.796, + "theoretical_loss": 3.540880594515155, + "tokens_seen": 1385902080 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002929889669007021, + "loss": 2.7673, + "theoretical_loss": 3.5408656069231563, + "tokens_seen": 1385967616 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2214935, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7183096408843994, + "objective/train/theoretical_loss": 3.540850620238258, + "objective/train/tokens_used": 1406493152, + "theoretical_loss": 3.540850620238258, + "tokens_seen": 1386033152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002929789368104313, + "loss": 2.7869, + "theoretical_loss": 3.540850620238258, + "tokens_seen": 1386033152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002929689067201605, + "loss": 2.8208, + "theoretical_loss": 3.540835634460363, + "tokens_seen": 1386098688 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002929588766298897, + "loss": 2.7932, + "theoretical_loss": 3.5408206495893726, + "tokens_seen": 1386164224 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029294884653961885, + "loss": 2.8974, + "theoretical_loss": 3.5408056656251903, + "tokens_seen": 1386229760 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029293881644934804, + "loss": 2.9394, + "theoretical_loss": 3.5407906825677173, + "tokens_seen": 1386295296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002929287863590772, + "loss": 2.7177, + "theoretical_loss": 3.540775700416856, + "tokens_seen": 1386360832 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029291875626880645, + "loss": 2.9224, + "theoretical_loss": 3.5407607191725097, + "tokens_seen": 1386426368 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002929087261785356, + "loss": 2.7772, + "theoretical_loss": 3.5407457388345795, + "tokens_seen": 1386491904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002928986960882648, + "loss": 2.9818, + "theoretical_loss": 3.540730759402968, + "tokens_seen": 1386557440 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029288866599799394, + "loss": 2.816, + "theoretical_loss": 3.540715780877578, + "tokens_seen": 1386622976 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002928786359077232, + "loss": 2.6988, + "theoretical_loss": 3.5407008032583116, + "tokens_seen": 1386688512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029286860581745236, + "loss": 2.8718, + "theoretical_loss": 3.5406858265450714, + "tokens_seen": 1386754048 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029285857572718154, + "loss": 2.7995, + "theoretical_loss": 3.5406708507377593, + "tokens_seen": 1386819584 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002928485456369107, + "loss": 2.8873, + "theoretical_loss": 3.5406558758362783, + "tokens_seen": 1386885120 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002928385155466399, + "loss": 2.7851, + "theoretical_loss": 3.54064090184053, + "tokens_seen": 1386950656 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002928284854563691, + "loss": 2.8611, + "theoretical_loss": 3.540625928750417, + "tokens_seen": 1387016192 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002928184553660983, + "loss": 2.8542, + "theoretical_loss": 3.5406109565658426, + "tokens_seen": 1387081728 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029280842527582744, + "loss": 2.8143, + "theoretical_loss": 3.540595985286708, + "tokens_seen": 1387147264 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002927983951855567, + "loss": 2.761, + "theoretical_loss": 3.5405810149129167, + "tokens_seen": 1387212800 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002927883650952859, + "loss": 2.917, + "theoretical_loss": 3.5405660454443706, + "tokens_seen": 1387278336 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029277833500501504, + "loss": 2.8674, + "theoretical_loss": 3.5405510768809725, + "tokens_seen": 1387343872 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002927683049147443, + "loss": 2.8045, + "theoretical_loss": 3.540536109222625, + "tokens_seen": 1387409408 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002927582748244734, + "loss": 2.7943, + "theoretical_loss": 3.54052114246923, + "tokens_seen": 1387474944 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029274824473420264, + "loss": 2.9251, + "theoretical_loss": 3.540506176620691, + "tokens_seen": 1387540480 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002927382146439318, + "loss": 2.6951, + "theoretical_loss": 3.54049121167691, + "tokens_seen": 1387606016 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2217773, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.367779016494751, + "objective/train/theoretical_loss": 3.5404762476377893, + "objective/train/tokens_used": 1408131552, + "theoretical_loss": 3.5404762476377893, + "tokens_seen": 1387671552 + }, + { + "epoch": 4.01, + "learning_rate": 0.000292728184553661, + "loss": 2.5776, + "theoretical_loss": 3.5404762476377893, + "tokens_seen": 1387671552 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002927181544633902, + "loss": 2.7832, + "theoretical_loss": 3.540461284503232, + "tokens_seen": 1387737088 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029270812437311936, + "loss": 2.6245, + "theoretical_loss": 3.540446322273141, + "tokens_seen": 1387802624 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029269809428284855, + "loss": 2.8429, + "theoretical_loss": 3.5404313609474176, + "tokens_seen": 1387868160 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002926880641925778, + "loss": 2.8159, + "theoretical_loss": 3.5404164005259657, + "tokens_seen": 1387933696 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002926780341023069, + "loss": 2.9133, + "theoretical_loss": 3.5404014410086875, + "tokens_seen": 1387999232 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029266800401203614, + "loss": 2.8934, + "theoretical_loss": 3.540386482395485, + "tokens_seen": 1388064768 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029265797392176527, + "loss": 2.7299, + "theoretical_loss": 3.540371524686263, + "tokens_seen": 1388130304 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002926479438314945, + "loss": 2.9106, + "theoretical_loss": 3.540356567880922, + "tokens_seen": 1388195840 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002926379137412237, + "loss": 2.8614, + "theoretical_loss": 3.540341611979365, + "tokens_seen": 1388261376 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029262788365095287, + "loss": 2.7568, + "theoretical_loss": 3.540326656981496, + "tokens_seen": 1388326912 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029261785356068205, + "loss": 2.8584, + "theoretical_loss": 3.5403117028872164, + "tokens_seen": 1388392448 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002926078234704113, + "loss": 2.7925, + "theoretical_loss": 3.54029674969643, + "tokens_seen": 1388457984 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002925977933801404, + "loss": 2.8861, + "theoretical_loss": 3.5402817974090386, + "tokens_seen": 1388523520 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029258776328986965, + "loss": 2.8314, + "theoretical_loss": 3.540266846024946, + "tokens_seen": 1388589056 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002925777331995988, + "loss": 2.8648, + "theoretical_loss": 3.540251895544054, + "tokens_seen": 1388654592 + }, + { + "epoch": 4.01, + "learning_rate": 0.000292567703109328, + "loss": 2.9964, + "theoretical_loss": 3.540236945966266, + "tokens_seen": 1388720128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002925576730190572, + "loss": 2.8606, + "theoretical_loss": 3.540221997291485, + "tokens_seen": 1388785664 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029254764292878637, + "loss": 2.945, + "theoretical_loss": 3.540207049519613, + "tokens_seen": 1388851200 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029253761283851555, + "loss": 2.8917, + "theoretical_loss": 3.540192102650554, + "tokens_seen": 1388916736 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029252758274824473, + "loss": 2.9901, + "theoretical_loss": 3.54017715668421, + "tokens_seen": 1388982272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002925175526579739, + "loss": 2.8468, + "theoretical_loss": 3.5401622116204843, + "tokens_seen": 1389047808 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029250752256770315, + "loss": 2.8237, + "theoretical_loss": 3.5401472674592798, + "tokens_seen": 1389113344 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002924974924774323, + "loss": 2.8056, + "theoretical_loss": 3.540132324200499, + "tokens_seen": 1389178880 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002924874623871615, + "loss": 2.817, + "theoretical_loss": 3.5401173818440457, + "tokens_seen": 1389244416 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2220338, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9872193336486816, + "objective/train/theoretical_loss": 3.5401024403898216, + "objective/train/tokens_used": 1409769952, + "theoretical_loss": 3.5401024403898216, + "tokens_seen": 1389309952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002924774322968907, + "loss": 3.0189, + "theoretical_loss": 3.5401024403898216, + "tokens_seen": 1389309952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002924674022066199, + "loss": 3.0143, + "theoretical_loss": 3.540087499837731, + "tokens_seen": 1389375488 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029245737211634906, + "loss": 2.8628, + "theoretical_loss": 3.540072560187676, + "tokens_seen": 1389441024 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029244734202607824, + "loss": 2.943, + "theoretical_loss": 3.54005762143956, + "tokens_seen": 1389506560 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002924373119358074, + "loss": 2.9574, + "theoretical_loss": 3.5400426835932857, + "tokens_seen": 1389572096 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029242728184553665, + "loss": 2.8724, + "theoretical_loss": 3.5400277466487564, + "tokens_seen": 1389637632 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002924172517552658, + "loss": 2.7658, + "theoretical_loss": 3.540012810605875, + "tokens_seen": 1389703168 + }, + { + "epoch": 4.01, + "learning_rate": 0.000292407221664995, + "loss": 2.9805, + "theoretical_loss": 3.539997875464544, + "tokens_seen": 1389768704 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029239719157472414, + "loss": 2.8302, + "theoretical_loss": 3.539982941224668, + "tokens_seen": 1389834240 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002923871614844534, + "loss": 2.9003, + "theoretical_loss": 3.5399680078861486, + "tokens_seen": 1389899776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029237713139418256, + "loss": 2.7721, + "theoretical_loss": 3.5399530754488895, + "tokens_seen": 1389965312 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029236710130391174, + "loss": 2.8282, + "theoretical_loss": 3.539938143912794, + "tokens_seen": 1390030848 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002923570712136409, + "loss": 2.8477, + "theoretical_loss": 3.539923213277765, + "tokens_seen": 1390096384 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002923470411233701, + "loss": 2.7981, + "theoretical_loss": 3.5399082835437055, + "tokens_seen": 1390161920 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002923370110330993, + "loss": 2.8445, + "theoretical_loss": 3.539893354710519, + "tokens_seen": 1390227456 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002923269809428285, + "loss": 2.8438, + "theoretical_loss": 3.5398784267781083, + "tokens_seen": 1390292992 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029231695085255765, + "loss": 2.8652, + "theoretical_loss": 3.5398634997463767, + "tokens_seen": 1390358528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002923069207622869, + "loss": 2.993, + "theoretical_loss": 3.539848573615228, + "tokens_seen": 1390424064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029229689067201606, + "loss": 2.8776, + "theoretical_loss": 3.5398336483845645, + "tokens_seen": 1390489600 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029228686058174524, + "loss": 2.8789, + "theoretical_loss": 3.5398187240542898, + "tokens_seen": 1390555136 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002922768304914744, + "loss": 2.891, + "theoretical_loss": 3.5398038006243073, + "tokens_seen": 1390620672 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002922668004012036, + "loss": 2.9071, + "theoretical_loss": 3.5397888780945204, + "tokens_seen": 1390686208 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002922567703109328, + "loss": 2.8383, + "theoretical_loss": 3.539773956464832, + "tokens_seen": 1390751744 + }, + { + "epoch": 4.01, + "learning_rate": 0.000292246740220662, + "loss": 2.7993, + "theoretical_loss": 3.539759035735145, + "tokens_seen": 1390817280 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029223671013039115, + "loss": 2.8608, + "theoretical_loss": 3.539744115905364, + "tokens_seen": 1390882816 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2223119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.669797897338867, + "objective/train/theoretical_loss": 3.539729196975391, + "objective/train/tokens_used": 1411408352, + "theoretical_loss": 3.539729196975391, + "tokens_seen": 1390948352 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002922266800401204, + "loss": 2.8383, + "theoretical_loss": 3.539729196975391, + "tokens_seen": 1390948352 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002922166499498495, + "loss": 2.7081, + "theoretical_loss": 3.53971427894513, + "tokens_seen": 1391013888 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029220661985957875, + "loss": 2.7716, + "theoretical_loss": 3.5396993618144847, + "tokens_seen": 1391079424 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002921965897693079, + "loss": 2.9111, + "theoretical_loss": 3.539684445583358, + "tokens_seen": 1391144960 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002921865596790371, + "loss": 2.8828, + "theoretical_loss": 3.539669530251653, + "tokens_seen": 1391210496 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002921765295887663, + "loss": 2.9484, + "theoretical_loss": 3.5396546158192734, + "tokens_seen": 1391276032 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029216649949849547, + "loss": 2.9892, + "theoretical_loss": 3.5396397022861237, + "tokens_seen": 1391341568 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029215646940822465, + "loss": 2.8846, + "theoretical_loss": 3.5396247896521054, + "tokens_seen": 1391407104 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002921464393179539, + "loss": 2.8724, + "theoretical_loss": 3.5396098779171226, + "tokens_seen": 1391472640 + }, + { + "epoch": 4.01, + "learning_rate": 0.000292136409227683, + "loss": 2.7472, + "theoretical_loss": 3.5395949670810793, + "tokens_seen": 1391538176 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029212637913741225, + "loss": 2.7827, + "theoretical_loss": 3.539580057143879, + "tokens_seen": 1391603712 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029211634904714143, + "loss": 2.7543, + "theoretical_loss": 3.5395651481054244, + "tokens_seen": 1391669248 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002921063189568706, + "loss": 2.8215, + "theoretical_loss": 3.539550239965619, + "tokens_seen": 1391734784 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002920962888665998, + "loss": 2.7522, + "theoretical_loss": 3.539535332724368, + "tokens_seen": 1391800320 + }, + { + "epoch": 4.01, + "learning_rate": 0.000292086258776329, + "loss": 2.7097, + "theoretical_loss": 3.539520426381573, + "tokens_seen": 1391865856 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029207622868605815, + "loss": 2.813, + "theoretical_loss": 3.539505520937138, + "tokens_seen": 1391931392 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002920661985957874, + "loss": 2.7255, + "theoretical_loss": 3.5394906163909674, + "tokens_seen": 1391996928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002920561685055165, + "loss": 2.6131, + "theoretical_loss": 3.5394757127429637, + "tokens_seen": 1392062464 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029204613841524575, + "loss": 2.9147, + "theoretical_loss": 3.5394608099930314, + "tokens_seen": 1392128000 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029203610832497493, + "loss": 2.6442, + "theoretical_loss": 3.539445908141074, + "tokens_seen": 1392193536 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002920260782347041, + "loss": 2.8891, + "theoretical_loss": 3.539431007186994, + "tokens_seen": 1392259072 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029201604814443335, + "loss": 2.932, + "theoretical_loss": 3.5394161071306964, + "tokens_seen": 1392324608 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002920060180541625, + "loss": 2.8414, + "theoretical_loss": 3.5394012079720847, + "tokens_seen": 1392390144 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002919959879638917, + "loss": 2.7219, + "theoretical_loss": 3.5393863097110616, + "tokens_seen": 1392455680 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002919859578736209, + "loss": 2.8346, + "theoretical_loss": 3.5393714123475313, + "tokens_seen": 1392521216 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2225839, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3396177291870117, + "objective/train/theoretical_loss": 3.539356515881398, + "objective/train/tokens_used": 1413046752, + "theoretical_loss": 3.539356515881398, + "tokens_seen": 1392586752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002919759277833501, + "loss": 2.7672, + "theoretical_loss": 3.539356515881398, + "tokens_seen": 1392586752 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029196589769307926, + "loss": 2.8099, + "theoretical_loss": 3.539341620312565, + "tokens_seen": 1392652288 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029195586760280844, + "loss": 2.8768, + "theoretical_loss": 3.5393267256409358, + "tokens_seen": 1392717824 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002919458375125376, + "loss": 2.7898, + "theoretical_loss": 3.5393118318664145, + "tokens_seen": 1392783360 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029193580742226685, + "loss": 2.6864, + "theoretical_loss": 3.5392969389889046, + "tokens_seen": 1392848896 + }, + { + "epoch": 4.01, + "learning_rate": 0.000291925777331996, + "loss": 2.7513, + "theoretical_loss": 3.53928204700831, + "tokens_seen": 1392914432 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002919157472417252, + "loss": 2.5414, + "theoretical_loss": 3.5392671559245352, + "tokens_seen": 1392979968 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029190571715145434, + "loss": 2.876, + "theoretical_loss": 3.539252265737482, + "tokens_seen": 1393045504 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002918956870611836, + "loss": 2.7746, + "theoretical_loss": 3.5392373764470566, + "tokens_seen": 1393111040 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029188565697091276, + "loss": 2.9082, + "theoretical_loss": 3.5392224880531615, + "tokens_seen": 1393176576 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029187562688064194, + "loss": 2.8014, + "theoretical_loss": 3.5392076005557005, + "tokens_seen": 1393242112 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002918655967903711, + "loss": 2.8545, + "theoretical_loss": 3.539192713954578, + "tokens_seen": 1393307648 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002918555667001003, + "loss": 2.9897, + "theoretical_loss": 3.5391778282496977, + "tokens_seen": 1393373184 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002918455366098295, + "loss": 2.821, + "theoretical_loss": 3.539162943440963, + "tokens_seen": 1393438720 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002918355065195587, + "loss": 2.883, + "theoretical_loss": 3.5391480595282787, + "tokens_seen": 1393504256 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029182547642928785, + "loss": 2.769, + "theoretical_loss": 3.539133176511548, + "tokens_seen": 1393569792 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002918154463390171, + "loss": 2.7716, + "theoretical_loss": 3.539118294390675, + "tokens_seen": 1393635328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029180541624874626, + "loss": 2.7896, + "theoretical_loss": 3.5391034131655643, + "tokens_seen": 1393700864 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029179538615847544, + "loss": 2.916, + "theoretical_loss": 3.539088532836119, + "tokens_seen": 1393766400 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002917853560682046, + "loss": 2.8271, + "theoretical_loss": 3.539073653402243, + "tokens_seen": 1393831936 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002917753259779338, + "loss": 2.8437, + "theoretical_loss": 3.5390587748638414, + "tokens_seen": 1393897472 + }, + { + "epoch": 4.01, + "learning_rate": 0.000291765295887663, + "loss": 2.8241, + "theoretical_loss": 3.5390438972208167, + "tokens_seen": 1393963008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002917552657973922, + "loss": 2.6951, + "theoretical_loss": 3.539029020473074, + "tokens_seen": 1394028544 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029174523570712135, + "loss": 2.7796, + "theoretical_loss": 3.5390141446205177, + "tokens_seen": 1394094080 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002917352056168506, + "loss": 2.8387, + "theoretical_loss": 3.5389992696630506, + "tokens_seen": 1394159616 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2228465, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6139564514160156, + "objective/train/theoretical_loss": 3.5389843956005773, + "objective/train/tokens_used": 1414685152, + "theoretical_loss": 3.5389843956005773, + "tokens_seen": 1394225152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002917251755265797, + "loss": 2.7959, + "theoretical_loss": 3.5389843956005773, + "tokens_seen": 1394225152 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029171514543630895, + "loss": 2.6832, + "theoretical_loss": 3.5389695224330024, + "tokens_seen": 1394290688 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002917051153460381, + "loss": 2.7169, + "theoretical_loss": 3.538954650160229, + "tokens_seen": 1394356224 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002916950852557673, + "loss": 2.8368, + "theoretical_loss": 3.5389397787821624, + "tokens_seen": 1394421760 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002916850551654965, + "loss": 2.7501, + "theoretical_loss": 3.5389249082987058, + "tokens_seen": 1394487296 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029167502507522567, + "loss": 2.6969, + "theoretical_loss": 3.538910038709764, + "tokens_seen": 1394552832 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029166499498495485, + "loss": 2.6589, + "theoretical_loss": 3.5388951700152402, + "tokens_seen": 1394618368 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002916549648946841, + "loss": 2.9206, + "theoretical_loss": 3.5388803022150395, + "tokens_seen": 1394683904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002916449348044132, + "loss": 2.8414, + "theoretical_loss": 3.5388654353090656, + "tokens_seen": 1394749440 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029163490471414245, + "loss": 2.5723, + "theoretical_loss": 3.538850569297223, + "tokens_seen": 1394814976 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029162487462387163, + "loss": 2.7766, + "theoretical_loss": 3.538835704179416, + "tokens_seen": 1394880512 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002916148445336008, + "loss": 2.7783, + "theoretical_loss": 3.5388208399555485, + "tokens_seen": 1394946048 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029160481444333, + "loss": 2.857, + "theoretical_loss": 3.538805976625525, + "tokens_seen": 1395011584 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002915947843530592, + "loss": 2.9488, + "theoretical_loss": 3.5387911141892494, + "tokens_seen": 1395077120 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029158475426278835, + "loss": 2.8488, + "theoretical_loss": 3.5387762526466267, + "tokens_seen": 1395142656 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002915747241725176, + "loss": 2.8129, + "theoretical_loss": 3.53876139199756, + "tokens_seen": 1395208192 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002915646940822467, + "loss": 2.7679, + "theoretical_loss": 3.5387465322419547, + "tokens_seen": 1395273728 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029155466399197595, + "loss": 2.678, + "theoretical_loss": 3.5387316733797145, + "tokens_seen": 1395339264 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002915446339017051, + "loss": 2.829, + "theoretical_loss": 3.5387168154107442, + "tokens_seen": 1395404800 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002915346038114343, + "loss": 2.845, + "theoretical_loss": 3.538701958334948, + "tokens_seen": 1395470336 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002915245737211635, + "loss": 2.6609, + "theoretical_loss": 3.5386871021522297, + "tokens_seen": 1395535872 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002915145436308927, + "loss": 2.9303, + "theoretical_loss": 3.5386722468624945, + "tokens_seen": 1395601408 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029150451354062186, + "loss": 2.8216, + "theoretical_loss": 3.5386573924656464, + "tokens_seen": 1395666944 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002914944834503511, + "loss": 2.9092, + "theoretical_loss": 3.5386425389615903, + "tokens_seen": 1395732480 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002914844533600802, + "loss": 2.7907, + "theoretical_loss": 3.5386276863502295, + "tokens_seen": 1395798016 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2229796, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5063087940216064, + "objective/train/theoretical_loss": 3.5386128346314694, + "objective/train/tokens_used": 1416323552, + "theoretical_loss": 3.5386128346314694, + "tokens_seen": 1395863552 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029147442326980946, + "loss": 2.6825, + "theoretical_loss": 3.5386128346314694, + "tokens_seen": 1395863552 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002914643931795386, + "loss": 2.828, + "theoretical_loss": 3.5385979838052144, + "tokens_seen": 1395929088 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002914543630892678, + "loss": 2.8455, + "theoretical_loss": 3.5385831338713682, + "tokens_seen": 1395994624 + }, + { + "epoch": 4.01, + "learning_rate": 0.000291444332998997, + "loss": 2.7952, + "theoretical_loss": 3.5385682848298363, + "tokens_seen": 1396060160 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002914343029087262, + "loss": 2.7784, + "theoretical_loss": 3.5385534366805227, + "tokens_seen": 1396125696 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029142427281845536, + "loss": 2.668, + "theoretical_loss": 3.538538589423332, + "tokens_seen": 1396191232 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029141424272818454, + "loss": 2.6876, + "theoretical_loss": 3.5385237430581684, + "tokens_seen": 1396256768 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002914042126379137, + "loss": 2.7658, + "theoretical_loss": 3.5385088975849373, + "tokens_seen": 1396322304 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029139418254764296, + "loss": 2.951, + "theoretical_loss": 3.538494053003542, + "tokens_seen": 1396387840 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002913841524573721, + "loss": 2.692, + "theoretical_loss": 3.5384792093138877, + "tokens_seen": 1396453376 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002913741223671013, + "loss": 2.9081, + "theoretical_loss": 3.5384643665158793, + "tokens_seen": 1396518912 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029136409227683045, + "loss": 2.8283, + "theoretical_loss": 3.5384495246094207, + "tokens_seen": 1396584448 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002913540621865597, + "loss": 2.9296, + "theoretical_loss": 3.5384346835944176, + "tokens_seen": 1396649984 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029134403209628886, + "loss": 2.9096, + "theoretical_loss": 3.538419843470774, + "tokens_seen": 1396715520 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029133400200601805, + "loss": 2.8766, + "theoretical_loss": 3.5384050042383937, + "tokens_seen": 1396781056 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002913239719157472, + "loss": 2.791, + "theoretical_loss": 3.538390165897183, + "tokens_seen": 1396846592 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029131394182547646, + "loss": 2.9314, + "theoretical_loss": 3.5383753284470454, + "tokens_seen": 1396912128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002913039117352056, + "loss": 2.7045, + "theoretical_loss": 3.538360491887886, + "tokens_seen": 1396977664 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002912938816449348, + "loss": 2.84, + "theoretical_loss": 3.5383456562196094, + "tokens_seen": 1397043200 + }, + { + "epoch": 4.01, + "learning_rate": 0.000291283851554664, + "loss": 2.8857, + "theoretical_loss": 3.5383308214421203, + "tokens_seen": 1397108736 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002912738214643932, + "loss": 2.6866, + "theoretical_loss": 3.5383159875553236, + "tokens_seen": 1397174272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002912637913741224, + "loss": 2.659, + "theoretical_loss": 3.538301154559124, + "tokens_seen": 1397239808 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029125376128385155, + "loss": 2.5849, + "theoretical_loss": 3.538286322453426, + "tokens_seen": 1397305344 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002912437311935808, + "loss": 3.0238, + "theoretical_loss": 3.5382714912381346, + "tokens_seen": 1397370880 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002912337011033099, + "loss": 2.7823, + "theoretical_loss": 3.5382566609131545, + "tokens_seen": 1397436416 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2232784, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7375376224517822, + "objective/train/theoretical_loss": 3.5382418314783903, + "objective/train/tokens_used": 1417961952, + "theoretical_loss": 3.5382418314783903, + "tokens_seen": 1397501952 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029122367101303915, + "loss": 2.7737, + "theoretical_loss": 3.5382418314783903, + "tokens_seen": 1397501952 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029121364092276833, + "loss": 2.7205, + "theoretical_loss": 3.5382270029337475, + "tokens_seen": 1397567488 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002912036108324975, + "loss": 2.7027, + "theoretical_loss": 3.5382121752791305, + "tokens_seen": 1397633024 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002911935807422267, + "loss": 2.7368, + "theoretical_loss": 3.5381973485144442, + "tokens_seen": 1397698560 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029118355065195587, + "loss": 2.8079, + "theoretical_loss": 3.5381825226395938, + "tokens_seen": 1397764096 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029117352056168505, + "loss": 2.7001, + "theoretical_loss": 3.538167697654483, + "tokens_seen": 1397829632 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002911634904714143, + "loss": 2.9278, + "theoretical_loss": 3.538152873559018, + "tokens_seen": 1397895168 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002911534603811434, + "loss": 2.7493, + "theoretical_loss": 3.538138050353103, + "tokens_seen": 1397960704 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029114343029087265, + "loss": 2.8864, + "theoretical_loss": 3.5381232280366435, + "tokens_seen": 1398026240 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029113340020060183, + "loss": 2.7291, + "theoretical_loss": 3.5381084066095436, + "tokens_seen": 1398091776 + }, + { + "epoch": 4.01, + "learning_rate": 0.000291123370110331, + "loss": 2.7011, + "theoretical_loss": 3.538093586071709, + "tokens_seen": 1398157312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002911133400200602, + "loss": 2.8184, + "theoretical_loss": 3.5380787664230446, + "tokens_seen": 1398222848 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002911033099297894, + "loss": 2.7221, + "theoretical_loss": 3.5380639476634546, + "tokens_seen": 1398288384 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029109327983951855, + "loss": 2.7254, + "theoretical_loss": 3.538049129792845, + "tokens_seen": 1398353920 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002910832497492478, + "loss": 2.6584, + "theoretical_loss": 3.5380343128111202, + "tokens_seen": 1398419456 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002910732196589769, + "loss": 2.948, + "theoretical_loss": 3.5380194967181855, + "tokens_seen": 1398484992 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029106318956870615, + "loss": 2.7915, + "theoretical_loss": 3.5380046815139456, + "tokens_seen": 1398550528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002910531594784353, + "loss": 2.7337, + "theoretical_loss": 3.5379898671983065, + "tokens_seen": 1398616064 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002910431293881645, + "loss": 2.7631, + "theoretical_loss": 3.537975053771172, + "tokens_seen": 1398681600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002910330992978937, + "loss": 2.6882, + "theoretical_loss": 3.537960241232448, + "tokens_seen": 1398747136 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002910230692076229, + "loss": 2.8705, + "theoretical_loss": 3.537945429582039, + "tokens_seen": 1398812672 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029101303911735206, + "loss": 2.7305, + "theoretical_loss": 3.5379306188198507, + "tokens_seen": 1398878208 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002910030090270813, + "loss": 2.8688, + "theoretical_loss": 3.5379158089457876, + "tokens_seen": 1398943744 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002909929789368104, + "loss": 2.9237, + "theoretical_loss": 3.5379009999597555, + "tokens_seen": 1399009280 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029098294884653966, + "loss": 2.8622, + "theoretical_loss": 3.5378861918616593, + "tokens_seen": 1399074816 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2235644, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.718377113342285, + "objective/train/theoretical_loss": 3.5378713846514045, + "objective/train/tokens_used": 1419600352, + "theoretical_loss": 3.5378713846514045, + "tokens_seen": 1399140352 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002909729187562688, + "loss": 2.8701, + "theoretical_loss": 3.5378713846514045, + "tokens_seen": 1399140352 + }, + { + "epoch": 4.01, + "learning_rate": 0.000290962888665998, + "loss": 2.7807, + "theoretical_loss": 3.5378565783288956, + "tokens_seen": 1399205888 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002909528585757272, + "loss": 2.7636, + "theoretical_loss": 3.537841772894038, + "tokens_seen": 1399271424 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002909428284854564, + "loss": 2.7587, + "theoretical_loss": 3.5378269683467374, + "tokens_seen": 1399336960 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029093279839518556, + "loss": 2.8222, + "theoretical_loss": 3.5378121646868985, + "tokens_seen": 1399402496 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029092276830491474, + "loss": 2.8655, + "theoretical_loss": 3.5377973619144267, + "tokens_seen": 1399468032 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002909127382146439, + "loss": 2.8061, + "theoretical_loss": 3.5377825600292274, + "tokens_seen": 1399533568 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029090270812437316, + "loss": 2.6234, + "theoretical_loss": 3.5377677590312056, + "tokens_seen": 1399599104 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002908926780341023, + "loss": 2.8063, + "theoretical_loss": 3.537752958920267, + "tokens_seen": 1399664640 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002908826479438315, + "loss": 2.877, + "theoretical_loss": 3.5377381596963167, + "tokens_seen": 1399730176 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029087261785356065, + "loss": 2.8413, + "theoretical_loss": 3.5377233613592596, + "tokens_seen": 1399795712 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002908625877632899, + "loss": 2.8274, + "theoretical_loss": 3.5377085639090016, + "tokens_seen": 1399861248 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029085255767301906, + "loss": 2.7406, + "theoretical_loss": 3.5376937673454476, + "tokens_seen": 1399926784 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029084252758274825, + "loss": 2.8969, + "theoretical_loss": 3.5376789716685035, + "tokens_seen": 1399992320 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002908324974924774, + "loss": 2.8942, + "theoretical_loss": 3.5376641768780743, + "tokens_seen": 1400057856 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029082246740220666, + "loss": 2.8148, + "theoretical_loss": 3.537649382974066, + "tokens_seen": 1400123392 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002908124373119358, + "loss": 2.779, + "theoretical_loss": 3.5376345899563826, + "tokens_seen": 1400188928 + }, + { + "epoch": 4.01, + "learning_rate": 0.000290802407221665, + "loss": 2.9249, + "theoretical_loss": 3.5376197978249304, + "tokens_seen": 1400254464 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029079237713139415, + "loss": 2.9813, + "theoretical_loss": 3.537605006579615, + "tokens_seen": 1400320000 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002907823470411234, + "loss": 2.808, + "theoretical_loss": 3.537590216220342, + "tokens_seen": 1400385536 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029077231695085257, + "loss": 2.9357, + "theoretical_loss": 3.5375754267470163, + "tokens_seen": 1400451072 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029076228686058175, + "loss": 2.7614, + "theoretical_loss": 3.537560638159544, + "tokens_seen": 1400516608 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029075225677031093, + "loss": 2.819, + "theoretical_loss": 3.537545850457829, + "tokens_seen": 1400582144 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002907422266800401, + "loss": 2.887, + "theoretical_loss": 3.5375310636417794, + "tokens_seen": 1400647680 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002907321965897693, + "loss": 2.708, + "theoretical_loss": 3.5375162777112985, + "tokens_seen": 1400713216 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2238363, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4105334281921387, + "objective/train/theoretical_loss": 3.537501492666293, + "objective/train/tokens_used": 1421238752, + "theoretical_loss": 3.537501492666293, + "tokens_seen": 1400778752 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029072216649949853, + "loss": 2.5718, + "theoretical_loss": 3.537501492666293, + "tokens_seen": 1400778752 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029071213640922765, + "loss": 2.8612, + "theoretical_loss": 3.537486708506668, + "tokens_seen": 1400844288 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002907021063189569, + "loss": 2.921, + "theoretical_loss": 3.537471925232329, + "tokens_seen": 1400909824 + }, + { + "epoch": 4.01, + "learning_rate": 0.000290692076228686, + "loss": 2.8234, + "theoretical_loss": 3.5374571428431816, + "tokens_seen": 1400975360 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029068204613841525, + "loss": 2.754, + "theoretical_loss": 3.5374423613391324, + "tokens_seen": 1401040896 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029067201604814443, + "loss": 2.8201, + "theoretical_loss": 3.5374275807200855, + "tokens_seen": 1401106432 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002906619859578736, + "loss": 2.8636, + "theoretical_loss": 3.537412800985947, + "tokens_seen": 1401171968 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002906519558676028, + "loss": 2.8512, + "theoretical_loss": 3.537398022136623, + "tokens_seen": 1401237504 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029064192577733203, + "loss": 2.6632, + "theoretical_loss": 3.5373832441720188, + "tokens_seen": 1401303040 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029063189568706116, + "loss": 2.7577, + "theoretical_loss": 3.53736846709204, + "tokens_seen": 1401368576 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002906218655967904, + "loss": 2.8264, + "theoretical_loss": 3.5373536908965924, + "tokens_seen": 1401434112 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002906118355065195, + "loss": 2.8378, + "theoretical_loss": 3.5373389155855817, + "tokens_seen": 1401499648 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029060180541624876, + "loss": 2.8689, + "theoretical_loss": 3.537324141158914, + "tokens_seen": 1401565184 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029059177532597794, + "loss": 2.74, + "theoretical_loss": 3.537309367616494, + "tokens_seen": 1401630720 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002905817452357071, + "loss": 2.9041, + "theoretical_loss": 3.537294594958228, + "tokens_seen": 1401696256 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002905717151454363, + "loss": 2.9421, + "theoretical_loss": 3.537279823184022, + "tokens_seen": 1401761792 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002905616850551655, + "loss": 2.8039, + "theoretical_loss": 3.5372650522937823, + "tokens_seen": 1401827328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029055165496489466, + "loss": 2.7814, + "theoretical_loss": 3.5372502822874132, + "tokens_seen": 1401892864 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002905416248746239, + "loss": 2.8236, + "theoretical_loss": 3.537235513164821, + "tokens_seen": 1401958400 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002905315947843531, + "loss": 2.8869, + "theoretical_loss": 3.5372207449259125, + "tokens_seen": 1402023936 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029052156469408226, + "loss": 2.9397, + "theoretical_loss": 3.537205977570592, + "tokens_seen": 1402089472 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002905115346038115, + "loss": 2.8201, + "theoretical_loss": 3.537191211098767, + "tokens_seen": 1402155008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002905015045135406, + "loss": 2.9464, + "theoretical_loss": 3.5371764455103416, + "tokens_seen": 1402220544 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029049147442326986, + "loss": 2.8694, + "theoretical_loss": 3.537161680805223, + "tokens_seen": 1402286080 + }, + { + "epoch": 4.01, + "learning_rate": 0.000290481444332999, + "loss": 2.8823, + "theoretical_loss": 3.5371469169833163, + "tokens_seen": 1402351616 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2241241, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8313655853271484, + "objective/train/theoretical_loss": 3.537132154044528, + "objective/train/tokens_used": 1422877152, + "theoretical_loss": 3.537132154044528, + "tokens_seen": 1402417152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002904714142427282, + "loss": 2.7531, + "theoretical_loss": 3.537132154044528, + "tokens_seen": 1402417152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002904613841524574, + "loss": 2.8526, + "theoretical_loss": 3.5371173919887635, + "tokens_seen": 1402482688 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002904513540621866, + "loss": 2.8953, + "theoretical_loss": 3.5371026308159292, + "tokens_seen": 1402548224 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029044132397191576, + "loss": 2.7767, + "theoretical_loss": 3.5370878705259305, + "tokens_seen": 1402613760 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029043129388164494, + "loss": 2.8602, + "theoretical_loss": 3.5370731111186737, + "tokens_seen": 1402679296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002904212637913741, + "loss": 2.7995, + "theoretical_loss": 3.537058352594065, + "tokens_seen": 1402744832 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029041123370110336, + "loss": 2.8968, + "theoretical_loss": 3.53704359495201, + "tokens_seen": 1402810368 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002904012036108325, + "loss": 2.7983, + "theoretical_loss": 3.5370288381924144, + "tokens_seen": 1402875904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002903911735205617, + "loss": 2.7582, + "theoretical_loss": 3.537014082315185, + "tokens_seen": 1402941440 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029038114343029085, + "loss": 2.9402, + "theoretical_loss": 3.536999327320227, + "tokens_seen": 1403006976 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002903711133400201, + "loss": 2.773, + "theoretical_loss": 3.536984573207447, + "tokens_seen": 1403072512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029036108324974926, + "loss": 2.8263, + "theoretical_loss": 3.5369698199767514, + "tokens_seen": 1403138048 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029035105315947845, + "loss": 2.8995, + "theoretical_loss": 3.5369550676280452, + "tokens_seen": 1403203584 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002903410230692076, + "loss": 2.9105, + "theoretical_loss": 3.5369403161612354, + "tokens_seen": 1403269120 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029033099297893686, + "loss": 2.7895, + "theoretical_loss": 3.5369255655762277, + "tokens_seen": 1403334656 + }, + { + "epoch": 4.01, + "learning_rate": 0.000290320962888666, + "loss": 2.8077, + "theoretical_loss": 3.536910815872928, + "tokens_seen": 1403400192 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002903109327983952, + "loss": 2.7444, + "theoretical_loss": 3.536896067051243, + "tokens_seen": 1403465728 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029030090270812435, + "loss": 2.8192, + "theoretical_loss": 3.536881319111078, + "tokens_seen": 1403531264 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002902908726178536, + "loss": 2.937, + "theoretical_loss": 3.5368665720523405, + "tokens_seen": 1403596800 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029028084252758277, + "loss": 2.8576, + "theoretical_loss": 3.5368518258749355, + "tokens_seen": 1403662336 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029027081243731195, + "loss": 2.8176, + "theoretical_loss": 3.5368370805787697, + "tokens_seen": 1403727872 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029026078234704113, + "loss": 2.7239, + "theoretical_loss": 3.5368223361637483, + "tokens_seen": 1403793408 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002902507522567703, + "loss": 2.7348, + "theoretical_loss": 3.5368075926297795, + "tokens_seen": 1403858944 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002902407221664995, + "loss": 2.9098, + "theoretical_loss": 3.536792849976768, + "tokens_seen": 1403924480 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029023069207622873, + "loss": 2.7769, + "theoretical_loss": 3.5367781082046204, + "tokens_seen": 1403990016 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2242645, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.811385154724121, + "objective/train/theoretical_loss": 3.5367633673132426, + "objective/train/tokens_used": 1424515552, + "theoretical_loss": 3.5367633673132426, + "tokens_seen": 1404055552 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029022066198595785, + "loss": 2.9031, + "theoretical_loss": 3.5367633673132426, + "tokens_seen": 1404055552 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002902106318956871, + "loss": 2.7142, + "theoretical_loss": 3.5367486273025417, + "tokens_seen": 1404121088 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002902006018054162, + "loss": 2.7702, + "theoretical_loss": 3.5367338881724235, + "tokens_seen": 1404186624 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029019057171514545, + "loss": 2.6247, + "theoretical_loss": 3.536719149922794, + "tokens_seen": 1404252160 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029018054162487463, + "loss": 2.8229, + "theoretical_loss": 3.53670441255356, + "tokens_seen": 1404317696 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002901705115346038, + "loss": 2.8689, + "theoretical_loss": 3.536689676064628, + "tokens_seen": 1404383232 + }, + { + "epoch": 4.01, + "learning_rate": 0.000290160481444333, + "loss": 2.6915, + "theoretical_loss": 3.5366749404559035, + "tokens_seen": 1404448768 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029015045135406223, + "loss": 2.7894, + "theoretical_loss": 3.5366602057272933, + "tokens_seen": 1404514304 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029014042126379136, + "loss": 2.9666, + "theoretical_loss": 3.536645471878704, + "tokens_seen": 1404579840 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002901303911735206, + "loss": 2.852, + "theoretical_loss": 3.536630738910042, + "tokens_seen": 1404645376 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002901203610832497, + "loss": 2.6389, + "theoretical_loss": 3.536616006821214, + "tokens_seen": 1404710912 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029011033099297896, + "loss": 2.6382, + "theoretical_loss": 3.5366012756121252, + "tokens_seen": 1404776448 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029010030090270814, + "loss": 2.6671, + "theoretical_loss": 3.536586545282683, + "tokens_seen": 1404841984 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002900902708124373, + "loss": 2.8553, + "theoretical_loss": 3.5365718158327937, + "tokens_seen": 1404907520 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002900802407221665, + "loss": 2.8113, + "theoretical_loss": 3.5365570872623637, + "tokens_seen": 1404973056 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002900702106318957, + "loss": 2.7389, + "theoretical_loss": 3.536542359571299, + "tokens_seen": 1405038592 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029006018054162486, + "loss": 2.641, + "theoretical_loss": 3.5365276327595065, + "tokens_seen": 1405104128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002900501504513541, + "loss": 2.8656, + "theoretical_loss": 3.5365129068268932, + "tokens_seen": 1405169664 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002900401203610832, + "loss": 2.8667, + "theoretical_loss": 3.536498181773365, + "tokens_seen": 1405235200 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029003009027081246, + "loss": 2.8061, + "theoretical_loss": 3.536483457598828, + "tokens_seen": 1405300736 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002900200601805416, + "loss": 2.8864, + "theoretical_loss": 3.53646873430319, + "tokens_seen": 1405366272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002900100300902708, + "loss": 2.8501, + "theoretical_loss": 3.5364540118863568, + "tokens_seen": 1405431808 + }, + { + "epoch": 4.01, + "learning_rate": 0.00029, + "loss": 2.8173, + "theoretical_loss": 3.536439290348235, + "tokens_seen": 1405497344 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002899899699097292, + "loss": 2.8486, + "theoretical_loss": 3.5364245696887306, + "tokens_seen": 1405562880 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028997993981945836, + "loss": 2.863, + "theoretical_loss": 3.536409849907751, + "tokens_seen": 1405628416 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2245486, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9126782417297363, + "objective/train/theoretical_loss": 3.536395131005203, + "objective/train/tokens_used": 1426153952, + "theoretical_loss": 3.536395131005203, + "tokens_seen": 1405693952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002899699097291876, + "loss": 2.6332, + "theoretical_loss": 3.536395131005203, + "tokens_seen": 1405693952 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002899598796389167, + "loss": 2.8473, + "theoretical_loss": 3.536380412980993, + "tokens_seen": 1405759488 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028994984954864596, + "loss": 2.8828, + "theoretical_loss": 3.536365695835027, + "tokens_seen": 1405825024 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002899398194583751, + "loss": 2.8059, + "theoretical_loss": 3.536350979567213, + "tokens_seen": 1405890560 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002899297893681043, + "loss": 2.9774, + "theoretical_loss": 3.5363362641774554, + "tokens_seen": 1405956096 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002899197592778335, + "loss": 2.9192, + "theoretical_loss": 3.5363215496656633, + "tokens_seen": 1406021632 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002899097291875627, + "loss": 2.8171, + "theoretical_loss": 3.5363068360317422, + "tokens_seen": 1406087168 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028989969909729187, + "loss": 2.8999, + "theoretical_loss": 3.536292123275599, + "tokens_seen": 1406152704 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028988966900702105, + "loss": 2.905, + "theoretical_loss": 3.5362774113971405, + "tokens_seen": 1406218240 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028987963891675023, + "loss": 2.8469, + "theoretical_loss": 3.536262700396273, + "tokens_seen": 1406283776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028986960882647946, + "loss": 2.8982, + "theoretical_loss": 3.536247990272904, + "tokens_seen": 1406349312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002898595787362086, + "loss": 2.958, + "theoretical_loss": 3.5362332810269397, + "tokens_seen": 1406414848 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002898495486459378, + "loss": 2.7671, + "theoretical_loss": 3.536218572658287, + "tokens_seen": 1406480384 + }, + { + "epoch": 4.01, + "learning_rate": 0.000289839518555667, + "loss": 2.7773, + "theoretical_loss": 3.5362038651668533, + "tokens_seen": 1406545920 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002898294884653962, + "loss": 2.6672, + "theoretical_loss": 3.5361891585525447, + "tokens_seen": 1406611456 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028981945837512537, + "loss": 2.7117, + "theoretical_loss": 3.536174452815268, + "tokens_seen": 1406676992 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028980942828485455, + "loss": 2.7196, + "theoretical_loss": 3.5361597479549305, + "tokens_seen": 1406742528 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028979939819458373, + "loss": 2.9269, + "theoretical_loss": 3.5361450439714384, + "tokens_seen": 1406808064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028978936810431297, + "loss": 2.8026, + "theoretical_loss": 3.536130340864699, + "tokens_seen": 1406873600 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028977933801404215, + "loss": 2.7469, + "theoretical_loss": 3.5361156386346195, + "tokens_seen": 1406939136 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028976930792377133, + "loss": 2.6121, + "theoretical_loss": 3.5361009372811067, + "tokens_seen": 1407004672 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002897592778335005, + "loss": 2.6693, + "theoretical_loss": 3.536086236804067, + "tokens_seen": 1407070208 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002897492477432297, + "loss": 2.9895, + "theoretical_loss": 3.5360715372034077, + "tokens_seen": 1407135744 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028973921765295893, + "loss": 2.9302, + "theoretical_loss": 3.536056838479036, + "tokens_seen": 1407201280 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028972918756268805, + "loss": 2.7566, + "theoretical_loss": 3.536042140630858, + "tokens_seen": 1407266816 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2247961, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8991289138793945, + "objective/train/theoretical_loss": 3.5360274436587815, + "objective/train/tokens_used": 1427792352, + "theoretical_loss": 3.5360274436587815, + "tokens_seen": 1407332352 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002897191574724173, + "loss": 2.8111, + "theoretical_loss": 3.5360274436587815, + "tokens_seen": 1407332352 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002897091273821464, + "loss": 2.781, + "theoretical_loss": 3.5360127475627126, + "tokens_seen": 1407397888 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028969909729187565, + "loss": 2.8348, + "theoretical_loss": 3.5359980523425594, + "tokens_seen": 1407463424 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028968906720160483, + "loss": 2.8157, + "theoretical_loss": 3.5359833579982283, + "tokens_seen": 1407528960 + }, + { + "epoch": 4.01, + "learning_rate": 0.000289679037111334, + "loss": 2.7501, + "theoretical_loss": 3.5359686645296264, + "tokens_seen": 1407594496 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002896690070210632, + "loss": 2.7909, + "theoretical_loss": 3.5359539719366606, + "tokens_seen": 1407660032 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028965897693079243, + "loss": 2.8238, + "theoretical_loss": 3.535939280219238, + "tokens_seen": 1407725568 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028964894684052156, + "loss": 3.0482, + "theoretical_loss": 3.5359245893772657, + "tokens_seen": 1407791104 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002896389167502508, + "loss": 2.7158, + "theoretical_loss": 3.5359098994106515, + "tokens_seen": 1407856640 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002896288866599799, + "loss": 2.711, + "theoretical_loss": 3.5358952103193015, + "tokens_seen": 1407922176 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028961885656970916, + "loss": 2.8809, + "theoretical_loss": 3.535880522103123, + "tokens_seen": 1407987712 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028960882647943834, + "loss": 2.8173, + "theoretical_loss": 3.5358658347620233, + "tokens_seen": 1408053248 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002895987963891675, + "loss": 2.7683, + "theoretical_loss": 3.53585114829591, + "tokens_seen": 1408118784 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002895887662988967, + "loss": 2.7501, + "theoretical_loss": 3.535836462704689, + "tokens_seen": 1408184320 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002895787362086259, + "loss": 2.7459, + "theoretical_loss": 3.535821777988269, + "tokens_seen": 1408249856 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028956870611835506, + "loss": 2.9445, + "theoretical_loss": 3.5358070941465556, + "tokens_seen": 1408315392 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002895586760280843, + "loss": 2.858, + "theoretical_loss": 3.5357924111794574, + "tokens_seen": 1408380928 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002895486459378134, + "loss": 2.9408, + "theoretical_loss": 3.535777729086881, + "tokens_seen": 1408446464 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028953861584754266, + "loss": 2.7784, + "theoretical_loss": 3.535763047868733, + "tokens_seen": 1408512000 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002895285857572718, + "loss": 2.8063, + "theoretical_loss": 3.5357483675249224, + "tokens_seen": 1408577536 + }, + { + "epoch": 4.01, + "learning_rate": 0.000289518555667001, + "loss": 2.8024, + "theoretical_loss": 3.5357336880553545, + "tokens_seen": 1408643072 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002895085255767302, + "loss": 2.8302, + "theoretical_loss": 3.535719009459938, + "tokens_seen": 1408708608 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002894984954864594, + "loss": 2.8249, + "theoretical_loss": 3.5357043317385792, + "tokens_seen": 1408774144 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028948846539618856, + "loss": 2.8231, + "theoretical_loss": 3.5356896548911863, + "tokens_seen": 1408839680 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002894784353059178, + "loss": 2.788, + "theoretical_loss": 3.5356749789176654, + "tokens_seen": 1408905216 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2250734, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.980250597000122, + "objective/train/theoretical_loss": 3.5356603038179246, + "objective/train/tokens_used": 1429430752, + "theoretical_loss": 3.5356603038179246, + "tokens_seen": 1408970752 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002894684052156469, + "loss": 2.8342, + "theoretical_loss": 3.5356603038179246, + "tokens_seen": 1408970752 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028945837512537616, + "loss": 2.8972, + "theoretical_loss": 3.5356456295918717, + "tokens_seen": 1409036288 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002894483450351053, + "loss": 2.7511, + "theoretical_loss": 3.5356309562394133, + "tokens_seen": 1409101824 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002894383149448345, + "loss": 2.9039, + "theoretical_loss": 3.535616283760457, + "tokens_seen": 1409167360 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002894282848545637, + "loss": 2.9004, + "theoretical_loss": 3.5356016121549096, + "tokens_seen": 1409232896 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002894182547642929, + "loss": 2.8388, + "theoretical_loss": 3.5355869414226797, + "tokens_seen": 1409298432 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028940822467402207, + "loss": 2.8888, + "theoretical_loss": 3.5355722715636744, + "tokens_seen": 1409363968 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028939819458375125, + "loss": 2.8576, + "theoretical_loss": 3.5355576025778, + "tokens_seen": 1409429504 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028938816449348043, + "loss": 2.5243, + "theoretical_loss": 3.5355429344649654, + "tokens_seen": 1409495040 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028937813440320966, + "loss": 2.833, + "theoretical_loss": 3.535528267225077, + "tokens_seen": 1409560576 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002893681043129388, + "loss": 2.8096, + "theoretical_loss": 3.5355136008580423, + "tokens_seen": 1409626112 + }, + { + "epoch": 4.01, + "learning_rate": 0.000289358074222668, + "loss": 2.6728, + "theoretical_loss": 3.53549893536377, + "tokens_seen": 1409691648 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002893480441323972, + "loss": 2.8033, + "theoretical_loss": 3.535484270742166, + "tokens_seen": 1409757184 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002893380140421264, + "loss": 2.7829, + "theoretical_loss": 3.5354696069931384, + "tokens_seen": 1409822720 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028932798395185557, + "loss": 2.7429, + "theoretical_loss": 3.535454944116595, + "tokens_seen": 1409888256 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028931795386158475, + "loss": 2.7215, + "theoretical_loss": 3.535440282112444, + "tokens_seen": 1409953792 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028930792377131393, + "loss": 2.8713, + "theoretical_loss": 3.5354256209805914, + "tokens_seen": 1410019328 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028929789368104317, + "loss": 2.8292, + "theoretical_loss": 3.535410960720945, + "tokens_seen": 1410084864 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002892878635907723, + "loss": 2.7503, + "theoretical_loss": 3.5353963013334138, + "tokens_seen": 1410150400 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028927783350050153, + "loss": 2.9142, + "theoretical_loss": 3.535381642817904, + "tokens_seen": 1410215936 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028926780341023066, + "loss": 2.7537, + "theoretical_loss": 3.535366985174324, + "tokens_seen": 1410281472 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002892577733199599, + "loss": 2.9748, + "theoretical_loss": 3.5353523284025807, + "tokens_seen": 1410347008 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002892477432296891, + "loss": 2.8651, + "theoretical_loss": 3.5353376725025827, + "tokens_seen": 1410412544 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028923771313941825, + "loss": 2.9366, + "theoretical_loss": 3.5353230174742363, + "tokens_seen": 1410478080 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028922768304914744, + "loss": 2.9269, + "theoretical_loss": 3.5353083633174505, + "tokens_seen": 1410543616 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2253723, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8534116744995117, + "objective/train/theoretical_loss": 3.535293710032132, + "objective/train/tokens_used": 1431069152, + "theoretical_loss": 3.535293710032132, + "tokens_seen": 1410609152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002892176529588766, + "loss": 2.7486, + "theoretical_loss": 3.535293710032132, + "tokens_seen": 1410609152 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002892076228686058, + "loss": 2.789, + "theoretical_loss": 3.5352790576181894, + "tokens_seen": 1410674688 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028919759277833503, + "loss": 2.763, + "theoretical_loss": 3.5352644060755294, + "tokens_seen": 1410740224 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028918756268806416, + "loss": 2.8609, + "theoretical_loss": 3.5352497554040605, + "tokens_seen": 1410805760 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002891775325977934, + "loss": 2.9178, + "theoretical_loss": 3.5352351056036904, + "tokens_seen": 1410871296 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002891675025075226, + "loss": 2.9658, + "theoretical_loss": 3.535220456674326, + "tokens_seen": 1410936832 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028915747241725176, + "loss": 2.7331, + "theoretical_loss": 3.535205808615876, + "tokens_seen": 1411002368 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028914744232698094, + "loss": 2.7495, + "theoretical_loss": 3.535191161428248, + "tokens_seen": 1411067904 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002891374122367101, + "loss": 2.7744, + "theoretical_loss": 3.5351765151113494, + "tokens_seen": 1411133440 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002891273821464393, + "loss": 2.6604, + "theoretical_loss": 3.535161869665088, + "tokens_seen": 1411198976 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028911735205616854, + "loss": 3.0059, + "theoretical_loss": 3.5351472250893723, + "tokens_seen": 1411264512 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028910732196589766, + "loss": 2.7315, + "theoretical_loss": 3.5351325813841097, + "tokens_seen": 1411330048 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002890972918756269, + "loss": 2.7345, + "theoretical_loss": 3.5351179385492078, + "tokens_seen": 1411395584 + }, + { + "epoch": 4.01, + "learning_rate": 0.000289087261785356, + "loss": 2.9511, + "theoretical_loss": 3.5351032965845746, + "tokens_seen": 1411461120 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028907723169508526, + "loss": 2.8695, + "theoretical_loss": 3.535088655490118, + "tokens_seen": 1411526656 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028906720160481444, + "loss": 2.6876, + "theoretical_loss": 3.5350740152657467, + "tokens_seen": 1411592192 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002890571715145436, + "loss": 2.7765, + "theoretical_loss": 3.535059375911367, + "tokens_seen": 1411657728 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002890471414242728, + "loss": 2.8309, + "theoretical_loss": 3.5350447374268876, + "tokens_seen": 1411723264 + }, + { + "epoch": 4.01, + "learning_rate": 0.000289037111334002, + "loss": 2.9136, + "theoretical_loss": 3.535030099812217, + "tokens_seen": 1411788800 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002890270812437312, + "loss": 2.9487, + "theoretical_loss": 3.5350154630672623, + "tokens_seen": 1411854336 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002890170511534604, + "loss": 2.8844, + "theoretical_loss": 3.535000827191932, + "tokens_seen": 1411919872 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002890070210631896, + "loss": 2.9179, + "theoretical_loss": 3.534986192186134, + "tokens_seen": 1411985408 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028899699097291876, + "loss": 2.8332, + "theoretical_loss": 3.534971558049776, + "tokens_seen": 1412050944 + }, + { + "epoch": 4.01, + "learning_rate": 0.000288986960882648, + "loss": 2.6693, + "theoretical_loss": 3.534956924782766, + "tokens_seen": 1412116480 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002889769307923771, + "loss": 2.8655, + "theoretical_loss": 3.5349422923850122, + "tokens_seen": 1412182016 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2256780, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.505932331085205, + "objective/train/theoretical_loss": 3.5349276608564226, + "objective/train/tokens_used": 1432707552, + "theoretical_loss": 3.5349276608564226, + "tokens_seen": 1412247552 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028896690070210636, + "loss": 2.9116, + "theoretical_loss": 3.5349276608564226, + "tokens_seen": 1412247552 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002889568706118355, + "loss": 2.6413, + "theoretical_loss": 3.5349130301969054, + "tokens_seen": 1412313088 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002889468405215647, + "loss": 2.9647, + "theoretical_loss": 3.5348984004063686, + "tokens_seen": 1412378624 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002889368104312939, + "loss": 2.7943, + "theoretical_loss": 3.5348837714847194, + "tokens_seen": 1412444160 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002889267803410231, + "loss": 2.7776, + "theoretical_loss": 3.5348691434318678, + "tokens_seen": 1412509696 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028891675025075227, + "loss": 3.0221, + "theoretical_loss": 3.53485451624772, + "tokens_seen": 1412575232 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028890672016048145, + "loss": 2.8077, + "theoretical_loss": 3.5348398899321847, + "tokens_seen": 1412640768 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028889669007021063, + "loss": 2.7758, + "theoretical_loss": 3.5348252644851703, + "tokens_seen": 1412706304 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028888665997993986, + "loss": 2.8812, + "theoretical_loss": 3.534810639906585, + "tokens_seen": 1412771840 + }, + { + "epoch": 4.01, + "learning_rate": 0.000288876629889669, + "loss": 2.9368, + "theoretical_loss": 3.5347960161963368, + "tokens_seen": 1412837376 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028886659979939823, + "loss": 2.9327, + "theoretical_loss": 3.5347813933543337, + "tokens_seen": 1412902912 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002888565697091274, + "loss": 2.7436, + "theoretical_loss": 3.534766771380484, + "tokens_seen": 1412968448 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002888465396188566, + "loss": 2.91, + "theoretical_loss": 3.534752150274696, + "tokens_seen": 1413033984 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028883650952858577, + "loss": 2.9465, + "theoretical_loss": 3.5347375300368773, + "tokens_seen": 1413099520 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028882647943831495, + "loss": 2.8546, + "theoretical_loss": 3.5347229106669373, + "tokens_seen": 1413165056 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028881644934804413, + "loss": 2.655, + "theoretical_loss": 3.534708292164783, + "tokens_seen": 1413230592 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028880641925777337, + "loss": 2.717, + "theoretical_loss": 3.534693674530324, + "tokens_seen": 1413296128 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002887963891675025, + "loss": 2.7161, + "theoretical_loss": 3.5346790577634675, + "tokens_seen": 1413361664 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028878635907723173, + "loss": 2.9226, + "theoretical_loss": 3.5346644418641215, + "tokens_seen": 1413427200 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028877632898696086, + "loss": 2.8721, + "theoretical_loss": 3.5346498268321955, + "tokens_seen": 1413492736 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002887662988966901, + "loss": 2.7287, + "theoretical_loss": 3.534635212667597, + "tokens_seen": 1413558272 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002887562688064193, + "loss": 2.88, + "theoretical_loss": 3.534620599370234, + "tokens_seen": 1413623808 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028874623871614845, + "loss": 2.7821, + "theoretical_loss": 3.534605986940016, + "tokens_seen": 1413689344 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028873620862587764, + "loss": 2.5585, + "theoretical_loss": 3.5345913753768503, + "tokens_seen": 1413754880 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002887261785356068, + "loss": 2.6563, + "theoretical_loss": 3.534576764680646, + "tokens_seen": 1413820416 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2259451, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.742790937423706, + "objective/train/theoretical_loss": 3.5345621548513106, + "objective/train/tokens_used": 1434345952, + "theoretical_loss": 3.5345621548513106, + "tokens_seen": 1413885952 + }, + { + "epoch": 4.01, + "learning_rate": 0.000288716148445336, + "loss": 2.7719, + "theoretical_loss": 3.5345621548513106, + "tokens_seen": 1413885952 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028870611835506523, + "loss": 2.7012, + "theoretical_loss": 3.5345475458887536, + "tokens_seen": 1413951488 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028869608826479436, + "loss": 3.0078, + "theoretical_loss": 3.5345329377928825, + "tokens_seen": 1414017024 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002886860581745236, + "loss": 2.8316, + "theoretical_loss": 3.534518330563606, + "tokens_seen": 1414082560 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002886760280842528, + "loss": 2.9353, + "theoretical_loss": 3.5345037242008326, + "tokens_seen": 1414148096 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028866599799398196, + "loss": 2.871, + "theoretical_loss": 3.5344891187044705, + "tokens_seen": 1414213632 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028865596790371114, + "loss": 2.9007, + "theoretical_loss": 3.5344745140744287, + "tokens_seen": 1414279168 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002886459378134403, + "loss": 2.7905, + "theoretical_loss": 3.5344599103106153, + "tokens_seen": 1414344704 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002886359077231695, + "loss": 2.9376, + "theoretical_loss": 3.5344453074129385, + "tokens_seen": 1414410240 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028862587763289874, + "loss": 2.8119, + "theoretical_loss": 3.5344307053813075, + "tokens_seen": 1414475776 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028861584754262786, + "loss": 2.8282, + "theoretical_loss": 3.53441610421563, + "tokens_seen": 1414541312 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002886058174523571, + "loss": 2.7144, + "theoretical_loss": 3.534401503915815, + "tokens_seen": 1414606848 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002885957873620862, + "loss": 2.6486, + "theoretical_loss": 3.5343869044817717, + "tokens_seen": 1414672384 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028858575727181546, + "loss": 2.7814, + "theoretical_loss": 3.5343723059134073, + "tokens_seen": 1414737920 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028857572718154464, + "loss": 2.6896, + "theoretical_loss": 3.5343577082106314, + "tokens_seen": 1414803456 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002885656970912738, + "loss": 2.912, + "theoretical_loss": 3.534343111373352, + "tokens_seen": 1414868992 + }, + { + "epoch": 4.01, + "learning_rate": 0.000288555667001003, + "loss": 2.9293, + "theoretical_loss": 3.5343285154014783, + "tokens_seen": 1414934528 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002885456369107322, + "loss": 2.8758, + "theoretical_loss": 3.534313920294918, + "tokens_seen": 1415000064 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028853560682046137, + "loss": 2.7926, + "theoretical_loss": 3.5342993260535804, + "tokens_seen": 1415065600 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002885255767301906, + "loss": 2.7197, + "theoretical_loss": 3.5342847326773743, + "tokens_seen": 1415131136 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028851554663991973, + "loss": 2.9349, + "theoretical_loss": 3.5342701401662078, + "tokens_seen": 1415196672 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028850551654964896, + "loss": 2.8147, + "theoretical_loss": 3.53425554851999, + "tokens_seen": 1415262208 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028849548645937815, + "loss": 2.7569, + "theoretical_loss": 3.5342409577386293, + "tokens_seen": 1415327744 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002884854563691073, + "loss": 2.8231, + "theoretical_loss": 3.5342263678220345, + "tokens_seen": 1415393280 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002884754262788365, + "loss": 2.786, + "theoretical_loss": 3.534211778770114, + "tokens_seen": 1415458816 + }, + { + "epoch": 4.01, + "objective/train/docs_used": 2260925, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.849684476852417, + "objective/train/theoretical_loss": 3.534197190582778, + "objective/train/tokens_used": 1435984352, + "theoretical_loss": 3.534197190582778, + "tokens_seen": 1415524352 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002884653961885657, + "loss": 2.8892, + "theoretical_loss": 3.534197190582778, + "tokens_seen": 1415524352 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028845536609829487, + "loss": 2.8053, + "theoretical_loss": 3.534182603259933, + "tokens_seen": 1415589888 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002884453360080241, + "loss": 2.9752, + "theoretical_loss": 3.534168016801489, + "tokens_seen": 1415655424 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028843530591775323, + "loss": 2.7464, + "theoretical_loss": 3.534153431207355, + "tokens_seen": 1415720960 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028842527582748247, + "loss": 2.8994, + "theoretical_loss": 3.534138846477439, + "tokens_seen": 1415786496 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002884152457372116, + "loss": 2.8722, + "theoretical_loss": 3.53412426261165, + "tokens_seen": 1415852032 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028840521564694083, + "loss": 2.9402, + "theoretical_loss": 3.534109679609897, + "tokens_seen": 1415917568 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028839518555667, + "loss": 2.8654, + "theoretical_loss": 3.5340950974720893, + "tokens_seen": 1415983104 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002883851554663992, + "loss": 2.7824, + "theoretical_loss": 3.5340805161981352, + "tokens_seen": 1416048640 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002883751253761284, + "loss": 2.8861, + "theoretical_loss": 3.534065935787943, + "tokens_seen": 1416114176 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002883650952858576, + "loss": 2.8187, + "theoretical_loss": 3.5340513562414224, + "tokens_seen": 1416179712 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028835506519558674, + "loss": 2.898, + "theoretical_loss": 3.534036777558482, + "tokens_seen": 1416245248 + }, + { + "epoch": 4.01, + "learning_rate": 0.00028834503510531597, + "loss": 2.8782, + "theoretical_loss": 3.534022199739031, + "tokens_seen": 1416310784 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002883350050150451, + "loss": 2.9543, + "theoretical_loss": 3.534007622782978, + "tokens_seen": 1416376320 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028832497492477433, + "loss": 2.964, + "theoretical_loss": 3.533993046690232, + "tokens_seen": 1416441856 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002883149448345035, + "loss": 2.9012, + "theoretical_loss": 3.533978471460701, + "tokens_seen": 1416507392 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002883049147442327, + "loss": 2.5897, + "theoretical_loss": 3.5339638970942957, + "tokens_seen": 1416572928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002882948846539619, + "loss": 2.8278, + "theoretical_loss": 3.5339493235909236, + "tokens_seen": 1416638464 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028828485456369106, + "loss": 2.9362, + "theoretical_loss": 3.533934750950495, + "tokens_seen": 1416704000 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002882748244734203, + "loss": 2.8099, + "theoretical_loss": 3.5339201791729176, + "tokens_seen": 1416769536 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002882647943831495, + "loss": 2.7849, + "theoretical_loss": 3.5339056082581006, + "tokens_seen": 1416835072 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028825476429287866, + "loss": 2.6778, + "theoretical_loss": 3.5338910382059536, + "tokens_seen": 1416900608 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028824473420260784, + "loss": 2.829, + "theoretical_loss": 3.533876469016386, + "tokens_seen": 1416966144 + }, + { + "epoch": 4.02, + "learning_rate": 0.000288234704112337, + "loss": 2.8149, + "theoretical_loss": 3.5338619006893053, + "tokens_seen": 1417031680 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002882246740220662, + "loss": 2.7403, + "theoretical_loss": 3.533847333224622, + "tokens_seen": 1417097216 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2264585, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9305531978607178, + "objective/train/theoretical_loss": 3.533832766622244, + "objective/train/tokens_used": 1437622752, + "theoretical_loss": 3.533832766622244, + "tokens_seen": 1417162752 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028821464393179543, + "loss": 2.752, + "theoretical_loss": 3.533832766622244, + "tokens_seen": 1417162752 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028820461384152456, + "loss": 2.8292, + "theoretical_loss": 3.5338182008820818, + "tokens_seen": 1417228288 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002881945837512538, + "loss": 2.8638, + "theoretical_loss": 3.533803636004044, + "tokens_seen": 1417293824 + }, + { + "epoch": 4.02, + "learning_rate": 0.000288184553660983, + "loss": 2.749, + "theoretical_loss": 3.5337890719880383, + "tokens_seen": 1417359360 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028817452357071216, + "loss": 2.6767, + "theoretical_loss": 3.5337745088339756, + "tokens_seen": 1417424896 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028816449348044134, + "loss": 2.7743, + "theoretical_loss": 3.5337599465417644, + "tokens_seen": 1417490432 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002881544633901705, + "loss": 2.8965, + "theoretical_loss": 3.5337453851113136, + "tokens_seen": 1417555968 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002881444332998997, + "loss": 2.816, + "theoretical_loss": 3.533730824542533, + "tokens_seen": 1417621504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028813440320962894, + "loss": 2.932, + "theoretical_loss": 3.533716264835331, + "tokens_seen": 1417687040 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028812437311935806, + "loss": 2.741, + "theoretical_loss": 3.5337017059896176, + "tokens_seen": 1417752576 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002881143430290873, + "loss": 2.7256, + "theoretical_loss": 3.533687148005301, + "tokens_seen": 1417818112 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002881043129388164, + "loss": 2.8326, + "theoretical_loss": 3.5336725908822917, + "tokens_seen": 1417883648 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028809428284854566, + "loss": 2.9239, + "theoretical_loss": 3.5336580346204975, + "tokens_seen": 1417949184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028808425275827484, + "loss": 2.8621, + "theoretical_loss": 3.533643479219829, + "tokens_seen": 1418014720 + }, + { + "epoch": 4.02, + "learning_rate": 0.000288074222668004, + "loss": 2.8896, + "theoretical_loss": 3.5336289246801944, + "tokens_seen": 1418080256 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002880641925777332, + "loss": 2.8573, + "theoretical_loss": 3.5336143710015038, + "tokens_seen": 1418145792 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002880541624874624, + "loss": 2.8264, + "theoretical_loss": 3.533599818183666, + "tokens_seen": 1418211328 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028804413239719157, + "loss": 3.0221, + "theoretical_loss": 3.5335852662265905, + "tokens_seen": 1418276864 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002880341023069208, + "loss": 2.7278, + "theoretical_loss": 3.5335707151301863, + "tokens_seen": 1418342400 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028802407221664993, + "loss": 2.8648, + "theoretical_loss": 3.533556164894363, + "tokens_seen": 1418407936 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028801404212637916, + "loss": 2.8762, + "theoretical_loss": 3.53354161551903, + "tokens_seen": 1418473472 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028800401203610835, + "loss": 2.8241, + "theoretical_loss": 3.5335270670040964, + "tokens_seen": 1418539008 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002879939819458375, + "loss": 2.8665, + "theoretical_loss": 3.533512519349472, + "tokens_seen": 1418604544 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002879839518555667, + "loss": 2.8292, + "theoretical_loss": 3.533497972555066, + "tokens_seen": 1418670080 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002879739217652959, + "loss": 2.6641, + "theoretical_loss": 3.533483426620788, + "tokens_seen": 1418735616 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2267285, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8079428672790527, + "objective/train/theoretical_loss": 3.5334688815465465, + "objective/train/tokens_used": 1439261152, + "theoretical_loss": 3.5334688815465465, + "tokens_seen": 1418801152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028796389167502507, + "loss": 2.9142, + "theoretical_loss": 3.5334688815465465, + "tokens_seen": 1418801152 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002879538615847543, + "loss": 2.9259, + "theoretical_loss": 3.533454337332252, + "tokens_seen": 1418866688 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028794383149448343, + "loss": 2.7759, + "theoretical_loss": 3.533439793977813, + "tokens_seen": 1418932224 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028793380140421267, + "loss": 2.7838, + "theoretical_loss": 3.53342525148314, + "tokens_seen": 1418997760 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002879237713139418, + "loss": 2.7104, + "theoretical_loss": 3.533410709848142, + "tokens_seen": 1419063296 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028791374122367103, + "loss": 2.9695, + "theoretical_loss": 3.533396169072728, + "tokens_seen": 1419128832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002879037111334002, + "loss": 2.7972, + "theoretical_loss": 3.533381629156808, + "tokens_seen": 1419194368 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002878936810431294, + "loss": 2.8256, + "theoretical_loss": 3.5333670901002914, + "tokens_seen": 1419259904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002878836509528586, + "loss": 2.9138, + "theoretical_loss": 3.5333525519030884, + "tokens_seen": 1419325440 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002878736208625878, + "loss": 2.8705, + "theoretical_loss": 3.533338014565107, + "tokens_seen": 1419390976 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028786359077231694, + "loss": 2.87, + "theoretical_loss": 3.533323478086258, + "tokens_seen": 1419456512 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028785356068204617, + "loss": 2.8233, + "theoretical_loss": 3.5333089424664506, + "tokens_seen": 1419522048 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002878435305917753, + "loss": 2.9048, + "theoretical_loss": 3.5332944077055943, + "tokens_seen": 1419587584 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028783350050150453, + "loss": 2.9066, + "theoretical_loss": 3.533279873803599, + "tokens_seen": 1419653120 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002878234704112337, + "loss": 2.8326, + "theoretical_loss": 3.533265340760374, + "tokens_seen": 1419718656 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002878134403209629, + "loss": 2.7953, + "theoretical_loss": 3.533250808575829, + "tokens_seen": 1419784192 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002878034102306921, + "loss": 2.9048, + "theoretical_loss": 3.5332362772498733, + "tokens_seen": 1419849728 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028779338014042126, + "loss": 2.9938, + "theoretical_loss": 3.5332217467824174, + "tokens_seen": 1419915264 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028778335005015044, + "loss": 2.8541, + "theoretical_loss": 3.53320721717337, + "tokens_seen": 1419980800 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002877733199598797, + "loss": 2.8895, + "theoretical_loss": 3.5331926884226417, + "tokens_seen": 1420046336 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002877632898696088, + "loss": 2.8805, + "theoretical_loss": 3.5331781605301416, + "tokens_seen": 1420111872 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028775325977933804, + "loss": 2.8112, + "theoretical_loss": 3.533163633495779, + "tokens_seen": 1420177408 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028774322968906716, + "loss": 2.7347, + "theoretical_loss": 3.5331491073194643, + "tokens_seen": 1420242944 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002877331995987964, + "loss": 2.8887, + "theoretical_loss": 3.5331345820011073, + "tokens_seen": 1420308480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002877231695085256, + "loss": 2.939, + "theoretical_loss": 3.5331200575406174, + "tokens_seen": 1420374016 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2268625, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.847114324569702, + "objective/train/theoretical_loss": 3.5331055339379045, + "objective/train/tokens_used": 1440899552, + "theoretical_loss": 3.5331055339379045, + "tokens_seen": 1420439552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028771313941825476, + "loss": 2.8494, + "theoretical_loss": 3.5331055339379045, + "tokens_seen": 1420439552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028770310932798394, + "loss": 2.8358, + "theoretical_loss": 3.533091011192878, + "tokens_seen": 1420505088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002876930792377132, + "loss": 2.8431, + "theoretical_loss": 3.533076489305448, + "tokens_seen": 1420570624 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002876830491474423, + "loss": 2.833, + "theoretical_loss": 3.5330619682755247, + "tokens_seen": 1420636160 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028767301905717154, + "loss": 2.9249, + "theoretical_loss": 3.533047448103017, + "tokens_seen": 1420701696 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028766298896690067, + "loss": 2.8318, + "theoretical_loss": 3.5330329287878355, + "tokens_seen": 1420767232 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002876529588766299, + "loss": 2.9157, + "theoretical_loss": 3.53301841032989, + "tokens_seen": 1420832768 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002876429287863591, + "loss": 2.893, + "theoretical_loss": 3.5330038927290897, + "tokens_seen": 1420898304 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028763289869608826, + "loss": 2.6892, + "theoretical_loss": 3.5329893759853452, + "tokens_seen": 1420963840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028762286860581745, + "loss": 2.8895, + "theoretical_loss": 3.532974860098566, + "tokens_seen": 1421029376 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002876128385155466, + "loss": 2.8802, + "theoretical_loss": 3.5329603450686617, + "tokens_seen": 1421094912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002876028084252758, + "loss": 2.9462, + "theoretical_loss": 3.5329458308955433, + "tokens_seen": 1421160448 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028759277833500504, + "loss": 2.7409, + "theoretical_loss": 3.532931317579119, + "tokens_seen": 1421225984 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028758274824473417, + "loss": 2.8585, + "theoretical_loss": 3.5329168051193003, + "tokens_seen": 1421291520 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002875727181544634, + "loss": 2.8885, + "theoretical_loss": 3.5329022935159964, + "tokens_seen": 1421357056 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028756268806419253, + "loss": 2.9342, + "theoretical_loss": 3.5328877827691176, + "tokens_seen": 1421422592 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028755265797392177, + "loss": 2.8266, + "theoretical_loss": 3.5328732728785734, + "tokens_seen": 1421488128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028754262788365095, + "loss": 2.7388, + "theoretical_loss": 3.532858763844274, + "tokens_seen": 1421553664 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028753259779338013, + "loss": 2.89, + "theoretical_loss": 3.53284425566613, + "tokens_seen": 1421619200 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028752256770310936, + "loss": 2.788, + "theoretical_loss": 3.5328297483440503, + "tokens_seen": 1421684736 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028751253761283855, + "loss": 2.7123, + "theoretical_loss": 3.5328152418779464, + "tokens_seen": 1421750272 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002875025075225677, + "loss": 2.8388, + "theoretical_loss": 3.5328007362677267, + "tokens_seen": 1421815808 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002874924774322969, + "loss": 2.8359, + "theoretical_loss": 3.532786231513302, + "tokens_seen": 1421881344 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002874824473420261, + "loss": 2.8826, + "theoretical_loss": 3.5327717276145827, + "tokens_seen": 1421946880 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028747241725175527, + "loss": 2.9014, + "theoretical_loss": 3.5327572245714784, + "tokens_seen": 1422012416 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2271378, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.853529453277588, + "objective/train/theoretical_loss": 3.532742722383899, + "objective/train/tokens_used": 1442537952, + "theoretical_loss": 3.532742722383899, + "tokens_seen": 1422077952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002874623871614845, + "loss": 2.8038, + "theoretical_loss": 3.532742722383899, + "tokens_seen": 1422077952 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028745235707121363, + "loss": 2.7548, + "theoretical_loss": 3.532728221051756, + "tokens_seen": 1422143488 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028744232698094287, + "loss": 2.9076, + "theoretical_loss": 3.5327137205749577, + "tokens_seen": 1422209024 + }, + { + "epoch": 4.02, + "learning_rate": 0.000287432296890672, + "loss": 2.842, + "theoretical_loss": 3.532699220953415, + "tokens_seen": 1422274560 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028742226680040123, + "loss": 2.8683, + "theoretical_loss": 3.5326847221870388, + "tokens_seen": 1422340096 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002874122367101304, + "loss": 2.7367, + "theoretical_loss": 3.532670224275738, + "tokens_seen": 1422405632 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002874022066198596, + "loss": 2.8451, + "theoretical_loss": 3.5326557272194234, + "tokens_seen": 1422471168 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002873921765295888, + "loss": 2.8478, + "theoretical_loss": 3.532641231018005, + "tokens_seen": 1422536704 + }, + { + "epoch": 4.02, + "learning_rate": 0.000287382146439318, + "loss": 2.6865, + "theoretical_loss": 3.532626735671393, + "tokens_seen": 1422602240 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028737211634904714, + "loss": 2.7385, + "theoretical_loss": 3.532612241179498, + "tokens_seen": 1422667776 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028736208625877637, + "loss": 2.7516, + "theoretical_loss": 3.53259774754223, + "tokens_seen": 1422733312 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002873520561685055, + "loss": 2.7337, + "theoretical_loss": 3.5325832547594995, + "tokens_seen": 1422798848 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028734202607823473, + "loss": 2.7278, + "theoretical_loss": 3.532568762831216, + "tokens_seen": 1422864384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002873319959879639, + "loss": 2.6366, + "theoretical_loss": 3.5325542717572906, + "tokens_seen": 1422929920 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002873219658976931, + "loss": 2.7574, + "theoretical_loss": 3.5325397815376327, + "tokens_seen": 1422995456 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002873119358074223, + "loss": 2.8177, + "theoretical_loss": 3.5325252921721537, + "tokens_seen": 1423060992 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028730190571715146, + "loss": 2.8821, + "theoretical_loss": 3.532510803660763, + "tokens_seen": 1423126528 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028729187562688064, + "loss": 2.9466, + "theoretical_loss": 3.5324963160033716, + "tokens_seen": 1423192064 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002872818455366099, + "loss": 2.82, + "theoretical_loss": 3.5324818291998894, + "tokens_seen": 1423257600 + }, + { + "epoch": 4.02, + "learning_rate": 0.000287271815446339, + "loss": 2.752, + "theoretical_loss": 3.532467343250227, + "tokens_seen": 1423323136 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028726178535606824, + "loss": 2.7457, + "theoretical_loss": 3.5324528581542944, + "tokens_seen": 1423388672 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028725175526579736, + "loss": 2.7253, + "theoretical_loss": 3.5324383739120027, + "tokens_seen": 1423454208 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002872417251755266, + "loss": 2.703, + "theoretical_loss": 3.532423890523261, + "tokens_seen": 1423519744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002872316950852558, + "loss": 2.8796, + "theoretical_loss": 3.5324094079879806, + "tokens_seen": 1423585280 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028722166499498496, + "loss": 2.8807, + "theoretical_loss": 3.5323949263060728, + "tokens_seen": 1423650816 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2274114, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.76324725151062, + "objective/train/theoretical_loss": 3.5323804454774463, + "objective/train/tokens_used": 1444176352, + "theoretical_loss": 3.5323804454774463, + "tokens_seen": 1423716352 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028721163490471414, + "loss": 2.91, + "theoretical_loss": 3.5323804454774463, + "tokens_seen": 1423716352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002872016048144434, + "loss": 2.758, + "theoretical_loss": 3.5323659655020125, + "tokens_seen": 1423781888 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002871915747241725, + "loss": 2.9362, + "theoretical_loss": 3.5323514863796817, + "tokens_seen": 1423847424 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028718154463390174, + "loss": 2.8532, + "theoretical_loss": 3.5323370081103644, + "tokens_seen": 1423912960 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028717151454363087, + "loss": 2.6816, + "theoretical_loss": 3.532322530693971, + "tokens_seen": 1423978496 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002871614844533601, + "loss": 2.9927, + "theoretical_loss": 3.5323080541304126, + "tokens_seen": 1424044032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002871514543630893, + "loss": 2.718, + "theoretical_loss": 3.5322935784195986, + "tokens_seen": 1424109568 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028714142427281846, + "loss": 2.9759, + "theoretical_loss": 3.5322791035614403, + "tokens_seen": 1424175104 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028713139418254765, + "loss": 2.8374, + "theoretical_loss": 3.532264629555848, + "tokens_seen": 1424240640 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002871213640922768, + "loss": 2.7889, + "theoretical_loss": 3.532250156402732, + "tokens_seen": 1424306176 + }, + { + "epoch": 4.02, + "learning_rate": 0.000287111334002006, + "loss": 2.9429, + "theoretical_loss": 3.5322356841020035, + "tokens_seen": 1424371712 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028710130391173524, + "loss": 2.8264, + "theoretical_loss": 3.532221212653573, + "tokens_seen": 1424437248 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028709127382146437, + "loss": 2.7143, + "theoretical_loss": 3.5322067420573506, + "tokens_seen": 1424502784 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002870812437311936, + "loss": 2.8577, + "theoretical_loss": 3.532192272313247, + "tokens_seen": 1424568320 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028707121364092273, + "loss": 2.7457, + "theoretical_loss": 3.532177803421173, + "tokens_seen": 1424633856 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028706118355065197, + "loss": 2.7619, + "theoretical_loss": 3.53216333538104, + "tokens_seen": 1424699392 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028705115346038115, + "loss": 2.7511, + "theoretical_loss": 3.532148868192757, + "tokens_seen": 1424764928 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028704112337011033, + "loss": 2.8973, + "theoretical_loss": 3.532134401856236, + "tokens_seen": 1424830464 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002870310932798395, + "loss": 2.8163, + "theoretical_loss": 3.5321199363713873, + "tokens_seen": 1424896000 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028702106318956875, + "loss": 2.75, + "theoretical_loss": 3.532105471738121, + "tokens_seen": 1424961536 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002870110330992979, + "loss": 2.8023, + "theoretical_loss": 3.532091007956349, + "tokens_seen": 1425027072 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002870010030090271, + "loss": 2.9439, + "theoretical_loss": 3.532076545025981, + "tokens_seen": 1425092608 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028699097291875624, + "loss": 2.9403, + "theoretical_loss": 3.532062082946928, + "tokens_seen": 1425158144 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028698094282848547, + "loss": 2.7033, + "theoretical_loss": 3.5320476217191006, + "tokens_seen": 1425223680 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028697091273821465, + "loss": 2.7519, + "theoretical_loss": 3.5320331613424103, + "tokens_seen": 1425289216 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2277044, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.92334246635437, + "objective/train/theoretical_loss": 3.5320187018167672, + "objective/train/tokens_used": 1445814752, + "theoretical_loss": 3.5320187018167672, + "tokens_seen": 1425354752 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028696088264794383, + "loss": 2.8074, + "theoretical_loss": 3.5320187018167672, + "tokens_seen": 1425354752 + }, + { + "epoch": 4.02, + "learning_rate": 0.000286950852557673, + "loss": 2.8522, + "theoretical_loss": 3.532004243142082, + "tokens_seen": 1425420288 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002869408224674022, + "loss": 2.8636, + "theoretical_loss": 3.531989785318266, + "tokens_seen": 1425485824 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002869307923771314, + "loss": 2.8917, + "theoretical_loss": 3.5319753283452298, + "tokens_seen": 1425551360 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002869207622868606, + "loss": 2.7169, + "theoretical_loss": 3.531960872222884, + "tokens_seen": 1425616896 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028691073219658974, + "loss": 2.712, + "theoretical_loss": 3.53194641695114, + "tokens_seen": 1425682432 + }, + { + "epoch": 4.02, + "learning_rate": 0.000286900702106319, + "loss": 2.9847, + "theoretical_loss": 3.5319319625299075, + "tokens_seen": 1425747968 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002868906720160481, + "loss": 2.8452, + "theoretical_loss": 3.531917508959099, + "tokens_seen": 1425813504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028688064192577734, + "loss": 2.8304, + "theoretical_loss": 3.531903056238624, + "tokens_seen": 1425879040 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002868706118355065, + "loss": 2.8549, + "theoretical_loss": 3.531888604368394, + "tokens_seen": 1425944576 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002868605817452357, + "loss": 2.7648, + "theoretical_loss": 3.53187415334832, + "tokens_seen": 1426010112 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002868505516549649, + "loss": 2.9847, + "theoretical_loss": 3.5318597031783123, + "tokens_seen": 1426075648 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002868405215646941, + "loss": 2.7663, + "theoretical_loss": 3.5318452538582825, + "tokens_seen": 1426141184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028683049147442324, + "loss": 2.7, + "theoretical_loss": 3.5318308053881418, + "tokens_seen": 1426206720 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002868204613841525, + "loss": 2.7284, + "theoretical_loss": 3.5318163577678003, + "tokens_seen": 1426272256 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002868104312938816, + "loss": 2.8234, + "theoretical_loss": 3.5318019109971694, + "tokens_seen": 1426337792 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028680040120361084, + "loss": 2.6841, + "theoretical_loss": 3.53178746507616, + "tokens_seen": 1426403328 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028679037111334, + "loss": 2.857, + "theoretical_loss": 3.531773020004683, + "tokens_seen": 1426468864 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002867803410230692, + "loss": 2.661, + "theoretical_loss": 3.53175857578265, + "tokens_seen": 1426534400 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028677031093279844, + "loss": 2.8829, + "theoretical_loss": 3.5317441324099708, + "tokens_seen": 1426599936 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028676028084252756, + "loss": 2.7662, + "theoretical_loss": 3.531729689886558, + "tokens_seen": 1426665472 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002867502507522568, + "loss": 2.7831, + "theoretical_loss": 3.5317152482123215, + "tokens_seen": 1426731008 + }, + { + "epoch": 4.02, + "learning_rate": 0.000286740220661986, + "loss": 2.7854, + "theoretical_loss": 3.531700807387173, + "tokens_seen": 1426796544 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028673019057171516, + "loss": 2.7429, + "theoretical_loss": 3.531686367411023, + "tokens_seen": 1426862080 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028672016048144434, + "loss": 2.8577, + "theoretical_loss": 3.5316719282837834, + "tokens_seen": 1426927616 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2279872, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9875762462615967, + "objective/train/theoretical_loss": 3.5316574900053643, + "objective/train/tokens_used": 1447453152, + "theoretical_loss": 3.5316574900053643, + "tokens_seen": 1426993152 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002867101303911736, + "loss": 2.8582, + "theoretical_loss": 3.5316574900053643, + "tokens_seen": 1426993152 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002867001003009027, + "loss": 2.7811, + "theoretical_loss": 3.531643052575678, + "tokens_seen": 1427058688 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028669007021063194, + "loss": 2.789, + "theoretical_loss": 3.5316286159946344, + "tokens_seen": 1427124224 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028668004012036107, + "loss": 2.8336, + "theoretical_loss": 3.531614180262146, + "tokens_seen": 1427189760 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002866700100300903, + "loss": 2.6798, + "theoretical_loss": 3.5315997453781227, + "tokens_seen": 1427255296 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002866599799398195, + "loss": 2.7573, + "theoretical_loss": 3.531585311342476, + "tokens_seen": 1427320832 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028664994984954866, + "loss": 2.9167, + "theoretical_loss": 3.5315708781551174, + "tokens_seen": 1427386368 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028663991975927785, + "loss": 2.8411, + "theoretical_loss": 3.531556445815958, + "tokens_seen": 1427451904 + }, + { + "epoch": 4.02, + "learning_rate": 0.000286629889669007, + "loss": 2.7965, + "theoretical_loss": 3.531542014324909, + "tokens_seen": 1427517440 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002866198595787362, + "loss": 2.9021, + "theoretical_loss": 3.5315275836818816, + "tokens_seen": 1427582976 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028660982948846544, + "loss": 2.6326, + "theoretical_loss": 3.531513153886787, + "tokens_seen": 1427648512 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028659979939819457, + "loss": 2.9914, + "theoretical_loss": 3.531498724939537, + "tokens_seen": 1427714048 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002865897693079238, + "loss": 2.6175, + "theoretical_loss": 3.5314842968400417, + "tokens_seen": 1427779584 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028657973921765293, + "loss": 2.8761, + "theoretical_loss": 3.5314698695882134, + "tokens_seen": 1427845120 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028656970912738217, + "loss": 2.7411, + "theoretical_loss": 3.5314554431839627, + "tokens_seen": 1427910656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028655967903711135, + "loss": 2.9904, + "theoretical_loss": 3.531441017627202, + "tokens_seen": 1427976192 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028654964894684053, + "loss": 2.7186, + "theoretical_loss": 3.5314265929178412, + "tokens_seen": 1428041728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002865396188565697, + "loss": 2.7108, + "theoretical_loss": 3.5314121690557925, + "tokens_seen": 1428107264 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028652958876629895, + "loss": 2.7069, + "theoretical_loss": 3.531397746040967, + "tokens_seen": 1428172800 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002865195586760281, + "loss": 2.9382, + "theoretical_loss": 3.5313833238732766, + "tokens_seen": 1428238336 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002865095285857573, + "loss": 3.0035, + "theoretical_loss": 3.531368902552632, + "tokens_seen": 1428303872 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028649949849548644, + "loss": 2.8726, + "theoretical_loss": 3.531354482078944, + "tokens_seen": 1428369408 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028648946840521567, + "loss": 2.7851, + "theoretical_loss": 3.531340062452126, + "tokens_seen": 1428434944 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028647943831494485, + "loss": 2.6045, + "theoretical_loss": 3.531325643672088, + "tokens_seen": 1428500480 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028646940822467403, + "loss": 2.8368, + "theoretical_loss": 3.531311225738741, + "tokens_seen": 1428566016 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2281301, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.879443407058716, + "objective/train/theoretical_loss": 3.531296808651997, + "objective/train/tokens_used": 1449091552, + "theoretical_loss": 3.531296808651997, + "tokens_seen": 1428631552 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002864593781344032, + "loss": 2.9238, + "theoretical_loss": 3.531296808651997, + "tokens_seen": 1428631552 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002864493480441324, + "loss": 2.9801, + "theoretical_loss": 3.5312823924117684, + "tokens_seen": 1428697088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002864393179538616, + "loss": 2.6452, + "theoretical_loss": 3.5312679770179654, + "tokens_seen": 1428762624 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002864292878635908, + "loss": 2.8531, + "theoretical_loss": 3.5312535624705, + "tokens_seen": 1428828160 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028641925777331994, + "loss": 2.6621, + "theoretical_loss": 3.5312391487692834, + "tokens_seen": 1428893696 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002864092276830492, + "loss": 2.6029, + "theoretical_loss": 3.5312247359142273, + "tokens_seen": 1428959232 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002863991975927783, + "loss": 2.7046, + "theoretical_loss": 3.5312103239052433, + "tokens_seen": 1429024768 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028638916750250754, + "loss": 2.8025, + "theoretical_loss": 3.531195912742243, + "tokens_seen": 1429090304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002863791374122367, + "loss": 2.7885, + "theoretical_loss": 3.5311815024251376, + "tokens_seen": 1429155840 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002863691073219659, + "loss": 2.8742, + "theoretical_loss": 3.5311670929538392, + "tokens_seen": 1429221376 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002863590772316951, + "loss": 2.7876, + "theoretical_loss": 3.5311526843282586, + "tokens_seen": 1429286912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002863490471414243, + "loss": 2.869, + "theoretical_loss": 3.531138276548308, + "tokens_seen": 1429352448 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028633901705115344, + "loss": 2.8216, + "theoretical_loss": 3.531123869613899, + "tokens_seen": 1429417984 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002863289869608827, + "loss": 2.8775, + "theoretical_loss": 3.5311094635249427, + "tokens_seen": 1429483520 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002863189568706118, + "loss": 3.0474, + "theoretical_loss": 3.531095058281351, + "tokens_seen": 1429549056 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028630892678034104, + "loss": 2.9038, + "theoretical_loss": 3.531080653883036, + "tokens_seen": 1429614592 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002862988966900702, + "loss": 2.7394, + "theoretical_loss": 3.531066250329909, + "tokens_seen": 1429680128 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002862888665997994, + "loss": 2.8981, + "theoretical_loss": 3.5310518476218817, + "tokens_seen": 1429745664 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002862788365095286, + "loss": 2.7326, + "theoretical_loss": 3.5310374457588654, + "tokens_seen": 1429811200 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028626880641925776, + "loss": 2.8245, + "theoretical_loss": 3.531023044740772, + "tokens_seen": 1429876736 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028625877632898694, + "loss": 2.7719, + "theoretical_loss": 3.531008644567514, + "tokens_seen": 1429942272 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002862487462387162, + "loss": 2.8893, + "theoretical_loss": 3.5309942452390017, + "tokens_seen": 1430007808 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002862387161484453, + "loss": 2.8015, + "theoretical_loss": 3.530979846755148, + "tokens_seen": 1430073344 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028622868605817454, + "loss": 2.7479, + "theoretical_loss": 3.5309654491158637, + "tokens_seen": 1430138880 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028621865596790367, + "loss": 2.794, + "theoretical_loss": 3.530951052321062, + "tokens_seen": 1430204416 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2284182, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8437345027923584, + "objective/train/theoretical_loss": 3.5309366563706535, + "objective/train/tokens_used": 1450729952, + "theoretical_loss": 3.5309366563706535, + "tokens_seen": 1430269952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002862086258776329, + "loss": 2.7713, + "theoretical_loss": 3.5309366563706535, + "tokens_seen": 1430269952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002861985957873621, + "loss": 2.8971, + "theoretical_loss": 3.53092226126455, + "tokens_seen": 1430335488 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028618856569709127, + "loss": 2.7774, + "theoretical_loss": 3.5309078670026635, + "tokens_seen": 1430401024 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028617853560682045, + "loss": 2.8968, + "theoretical_loss": 3.530893473584906, + "tokens_seen": 1430466560 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002861685055165497, + "loss": 2.8448, + "theoretical_loss": 3.530879081011189, + "tokens_seen": 1430532096 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002861584754262788, + "loss": 2.7505, + "theoretical_loss": 3.530864689281425, + "tokens_seen": 1430597632 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028614844533600805, + "loss": 2.9158, + "theoretical_loss": 3.5308502983955248, + "tokens_seen": 1430663168 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028613841524573717, + "loss": 2.7696, + "theoretical_loss": 3.530835908353401, + "tokens_seen": 1430728704 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002861283851554664, + "loss": 2.6454, + "theoretical_loss": 3.5308215191549657, + "tokens_seen": 1430794240 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002861183550651956, + "loss": 2.6937, + "theoretical_loss": 3.53080713080013, + "tokens_seen": 1430859776 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028610832497492477, + "loss": 2.7579, + "theoretical_loss": 3.5307927432888064, + "tokens_seen": 1430925312 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028609829488465395, + "loss": 2.8369, + "theoretical_loss": 3.5307783566209068, + "tokens_seen": 1430990848 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028608826479438313, + "loss": 2.8636, + "theoretical_loss": 3.530763970796343, + "tokens_seen": 1431056384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002860782347041123, + "loss": 2.732, + "theoretical_loss": 3.530749585815027, + "tokens_seen": 1431121920 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028606820461384155, + "loss": 2.772, + "theoretical_loss": 3.5307352016768707, + "tokens_seen": 1431187456 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002860581745235707, + "loss": 2.7548, + "theoretical_loss": 3.530720818381786, + "tokens_seen": 1431252992 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002860481444332999, + "loss": 2.8056, + "theoretical_loss": 3.5307064359296847, + "tokens_seen": 1431318528 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002860381143430291, + "loss": 2.8779, + "theoretical_loss": 3.530692054320479, + "tokens_seen": 1431384064 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002860280842527583, + "loss": 2.6319, + "theoretical_loss": 3.5306776735540817, + "tokens_seen": 1431449600 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002860180541624875, + "loss": 2.73, + "theoretical_loss": 3.530663293630403, + "tokens_seen": 1431515136 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028600802407221664, + "loss": 2.6967, + "theoretical_loss": 3.530648914549357, + "tokens_seen": 1431580672 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028599799398194587, + "loss": 2.8452, + "theoretical_loss": 3.5306345363108544, + "tokens_seen": 1431646208 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028598796389167505, + "loss": 2.6595, + "theoretical_loss": 3.530620158914808, + "tokens_seen": 1431711744 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028597793380140423, + "loss": 2.7543, + "theoretical_loss": 3.530605782361129, + "tokens_seen": 1431777280 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002859679037111334, + "loss": 2.7476, + "theoretical_loss": 3.53059140664973, + "tokens_seen": 1431842816 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2287003, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.888629913330078, + "objective/train/theoretical_loss": 3.5305770317805236, + "objective/train/tokens_used": 1452368352, + "theoretical_loss": 3.5305770317805236, + "tokens_seen": 1431908352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002859578736208626, + "loss": 2.8947, + "theoretical_loss": 3.5305770317805236, + "tokens_seen": 1431908352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002859478435305918, + "loss": 2.9043, + "theoretical_loss": 3.5305626577534213, + "tokens_seen": 1431973888 + }, + { + "epoch": 4.02, + "learning_rate": 0.000285937813440321, + "loss": 2.778, + "theoretical_loss": 3.5305482845683356, + "tokens_seen": 1432039424 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028592778335005014, + "loss": 2.9429, + "theoretical_loss": 3.530533912225178, + "tokens_seen": 1432104960 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002859177532597794, + "loss": 2.8741, + "theoretical_loss": 3.530519540723861, + "tokens_seen": 1432170496 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002859077231695085, + "loss": 2.7997, + "theoretical_loss": 3.5305051700642975, + "tokens_seen": 1432236032 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028589769307923774, + "loss": 2.7032, + "theoretical_loss": 3.5304908002463984, + "tokens_seen": 1432301568 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002858876629889669, + "loss": 2.9355, + "theoretical_loss": 3.5304764312700767, + "tokens_seen": 1432367104 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002858776328986961, + "loss": 2.6978, + "theoretical_loss": 3.5304620631352446, + "tokens_seen": 1432432640 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002858676028084253, + "loss": 2.9225, + "theoretical_loss": 3.530447695841814, + "tokens_seen": 1432498176 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002858575727181545, + "loss": 2.7833, + "theoretical_loss": 3.5304333293896972, + "tokens_seen": 1432563712 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028584754262788364, + "loss": 2.8082, + "theoretical_loss": 3.530418963778807, + "tokens_seen": 1432629248 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002858375125376129, + "loss": 2.7884, + "theoretical_loss": 3.5304045990090547, + "tokens_seen": 1432694784 + }, + { + "epoch": 4.02, + "learning_rate": 0.000285827482447342, + "loss": 2.8171, + "theoretical_loss": 3.5303902350803535, + "tokens_seen": 1432760320 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028581745235707124, + "loss": 2.7614, + "theoretical_loss": 3.5303758719926153, + "tokens_seen": 1432825856 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002858074222668004, + "loss": 2.7786, + "theoretical_loss": 3.5303615097457524, + "tokens_seen": 1432891392 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002857973921765296, + "loss": 2.8618, + "theoretical_loss": 3.530347148339677, + "tokens_seen": 1432956928 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002857873620862588, + "loss": 2.8463, + "theoretical_loss": 3.5303327877743014, + "tokens_seen": 1433022464 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028577733199598796, + "loss": 2.6777, + "theoretical_loss": 3.5303184280495383, + "tokens_seen": 1433088000 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028576730190571714, + "loss": 2.8414, + "theoretical_loss": 3.5303040691652994, + "tokens_seen": 1433153536 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002857572718154464, + "loss": 2.9106, + "theoretical_loss": 3.530289711121498, + "tokens_seen": 1433219072 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002857472417251755, + "loss": 2.9135, + "theoretical_loss": 3.5302753539180456, + "tokens_seen": 1433284608 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028573721163490474, + "loss": 2.8697, + "theoretical_loss": 3.5302609975548553, + "tokens_seen": 1433350144 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028572718154463387, + "loss": 2.7177, + "theoretical_loss": 3.5302466420318392, + "tokens_seen": 1433415680 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002857171514543631, + "loss": 2.8367, + "theoretical_loss": 3.5302322873489094, + "tokens_seen": 1433481216 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2289434, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6822054386138916, + "objective/train/theoretical_loss": 3.5302179335059787, + "objective/train/tokens_used": 1454006752, + "theoretical_loss": 3.5302179335059787, + "tokens_seen": 1433546752 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002857071213640923, + "loss": 2.7691, + "theoretical_loss": 3.5302179335059787, + "tokens_seen": 1433546752 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028569709127382147, + "loss": 2.857, + "theoretical_loss": 3.5302035805029597, + "tokens_seen": 1433612288 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028568706118355065, + "loss": 2.7548, + "theoretical_loss": 3.530189228339765, + "tokens_seen": 1433677824 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002856770310932799, + "loss": 2.8354, + "theoretical_loss": 3.5301748770163055, + "tokens_seen": 1433743360 + }, + { + "epoch": 4.02, + "learning_rate": 0.000285667001003009, + "loss": 2.8917, + "theoretical_loss": 3.5301605265324962, + "tokens_seen": 1433808896 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028565697091273825, + "loss": 2.8337, + "theoretical_loss": 3.5301461768882474, + "tokens_seen": 1433874432 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028564694082246737, + "loss": 2.7434, + "theoretical_loss": 3.530131828083473, + "tokens_seen": 1433939968 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002856369107321966, + "loss": 2.779, + "theoretical_loss": 3.530117480118085, + "tokens_seen": 1434005504 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002856268806419258, + "loss": 2.8562, + "theoretical_loss": 3.530103132991996, + "tokens_seen": 1434071040 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028561685055165497, + "loss": 2.903, + "theoretical_loss": 3.5300887867051185, + "tokens_seen": 1434136576 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028560682046138415, + "loss": 2.9265, + "theoretical_loss": 3.530074441257365, + "tokens_seen": 1434202112 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028559679037111333, + "loss": 2.95, + "theoretical_loss": 3.5300600966486484, + "tokens_seen": 1434267648 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002855867602808425, + "loss": 2.7286, + "theoretical_loss": 3.5300457528788813, + "tokens_seen": 1434333184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028557673019057175, + "loss": 2.9706, + "theoretical_loss": 3.5300314099479753, + "tokens_seen": 1434398720 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002855667001003009, + "loss": 2.7958, + "theoretical_loss": 3.5300170678558445, + "tokens_seen": 1434464256 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002855566700100301, + "loss": 2.9186, + "theoretical_loss": 3.5300027266024006, + "tokens_seen": 1434529792 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002855466399197593, + "loss": 2.7941, + "theoretical_loss": 3.529988386187556, + "tokens_seen": 1434595328 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002855366098294885, + "loss": 2.8254, + "theoretical_loss": 3.5299740466112244, + "tokens_seen": 1434660864 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028552657973921765, + "loss": 2.7911, + "theoretical_loss": 3.529959707873318, + "tokens_seen": 1434726400 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028551654964894684, + "loss": 2.7296, + "theoretical_loss": 3.529945369973749, + "tokens_seen": 1434791936 + }, + { + "epoch": 4.02, + "learning_rate": 0.000285506519558676, + "loss": 2.9884, + "theoretical_loss": 3.529931032912431, + "tokens_seen": 1434857472 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028549648946840525, + "loss": 2.6801, + "theoretical_loss": 3.529916696689276, + "tokens_seen": 1434923008 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002854864593781344, + "loss": 2.8708, + "theoretical_loss": 3.5299023613041967, + "tokens_seen": 1434988544 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002854764292878636, + "loss": 2.8949, + "theoretical_loss": 3.529888026757106, + "tokens_seen": 1435054080 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028546639919759274, + "loss": 2.9597, + "theoretical_loss": 3.529873693047917, + "tokens_seen": 1435119616 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2292216, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9003849029541016, + "objective/train/theoretical_loss": 3.529859360176542, + "objective/train/tokens_used": 1455645152, + "theoretical_loss": 3.529859360176542, + "tokens_seen": 1435185152 + }, + { + "epoch": 4.02, + "learning_rate": 0.000285456369107322, + "loss": 2.8575, + "theoretical_loss": 3.529859360176542, + "tokens_seen": 1435185152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028544633901705116, + "loss": 2.8455, + "theoretical_loss": 3.5298450281428937, + "tokens_seen": 1435250688 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028543630892678034, + "loss": 2.8273, + "theoretical_loss": 3.5298306969468856, + "tokens_seen": 1435316224 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002854262788365095, + "loss": 2.8132, + "theoretical_loss": 3.5298163665884297, + "tokens_seen": 1435381760 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002854162487462387, + "loss": 2.7918, + "theoretical_loss": 3.529802037067439, + "tokens_seen": 1435447296 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002854062186559679, + "loss": 2.8433, + "theoretical_loss": 3.529787708383827, + "tokens_seen": 1435512832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002853961885656971, + "loss": 2.9524, + "theoretical_loss": 3.529773380537505, + "tokens_seen": 1435578368 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028538615847542624, + "loss": 2.8007, + "theoretical_loss": 3.529759053528388, + "tokens_seen": 1435643904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002853761283851555, + "loss": 2.8064, + "theoretical_loss": 3.529744727356387, + "tokens_seen": 1435709440 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028536609829488466, + "loss": 2.8769, + "theoretical_loss": 3.529730402021416, + "tokens_seen": 1435774976 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028535606820461384, + "loss": 2.8106, + "theoretical_loss": 3.5297160775233873, + "tokens_seen": 1435840512 + }, + { + "epoch": 4.02, + "learning_rate": 0.000285346038114343, + "loss": 2.7904, + "theoretical_loss": 3.5297017538622137, + "tokens_seen": 1435906048 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002853360080240722, + "loss": 2.8224, + "theoretical_loss": 3.529687431037809, + "tokens_seen": 1435971584 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002853259779338014, + "loss": 2.7804, + "theoretical_loss": 3.5296731090500852, + "tokens_seen": 1436037120 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002853159478435306, + "loss": 2.9875, + "theoretical_loss": 3.529658787898956, + "tokens_seen": 1436102656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028530591775325975, + "loss": 2.8652, + "theoretical_loss": 3.5296444675843333, + "tokens_seen": 1436168192 + }, + { + "epoch": 4.02, + "learning_rate": 0.000285295887662989, + "loss": 2.8863, + "theoretical_loss": 3.5296301481061314, + "tokens_seen": 1436233728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002852858575727181, + "loss": 2.7792, + "theoretical_loss": 3.5296158294642623, + "tokens_seen": 1436299264 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028527582748244735, + "loss": 2.8541, + "theoretical_loss": 3.5296015116586394, + "tokens_seen": 1436364800 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002852657973921766, + "loss": 2.9591, + "theoretical_loss": 3.5295871946891753, + "tokens_seen": 1436430336 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002852557673019057, + "loss": 2.8561, + "theoretical_loss": 3.5295728785557836, + "tokens_seen": 1436495872 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028524573721163494, + "loss": 2.7772, + "theoretical_loss": 3.529558563258377, + "tokens_seen": 1436561408 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028523570712136407, + "loss": 3.0168, + "theoretical_loss": 3.529544248796869, + "tokens_seen": 1436626944 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002852256770310933, + "loss": 2.8834, + "theoretical_loss": 3.5295299351711718, + "tokens_seen": 1436692480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002852156469408225, + "loss": 2.7837, + "theoretical_loss": 3.529515622381199, + "tokens_seen": 1436758016 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2294933, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7038068771362305, + "objective/train/theoretical_loss": 3.5295013104268635, + "objective/train/tokens_used": 1457283552, + "theoretical_loss": 3.5295013104268635, + "tokens_seen": 1436823552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028520561685055167, + "loss": 2.804, + "theoretical_loss": 3.5295013104268635, + "tokens_seen": 1436823552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028519558676028085, + "loss": 2.8556, + "theoretical_loss": 3.529486999308079, + "tokens_seen": 1436889088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002851855566700101, + "loss": 2.9405, + "theoretical_loss": 3.5294726890247574, + "tokens_seen": 1436954624 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002851755265797392, + "loss": 2.8133, + "theoretical_loss": 3.5294583795768135, + "tokens_seen": 1437020160 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028516549648946845, + "loss": 2.8439, + "theoretical_loss": 3.529444070964159, + "tokens_seen": 1437085696 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028515546639919757, + "loss": 2.8152, + "theoretical_loss": 3.529429763186708, + "tokens_seen": 1437151232 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002851454363089268, + "loss": 2.7506, + "theoretical_loss": 3.529415456244373, + "tokens_seen": 1437216768 + }, + { + "epoch": 4.02, + "learning_rate": 0.000285135406218656, + "loss": 2.9513, + "theoretical_loss": 3.5294011501370672, + "tokens_seen": 1437282304 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028512537612838517, + "loss": 2.7726, + "theoretical_loss": 3.529386844864704, + "tokens_seen": 1437347840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028511534603811435, + "loss": 2.7593, + "theoretical_loss": 3.5293725404271967, + "tokens_seen": 1437413376 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028510531594784353, + "loss": 2.8667, + "theoretical_loss": 3.529358236824459, + "tokens_seen": 1437478912 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002850952858575727, + "loss": 2.9677, + "theoretical_loss": 3.529343934056403, + "tokens_seen": 1437544448 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028508525576730195, + "loss": 2.9136, + "theoretical_loss": 3.5293296321229426, + "tokens_seen": 1437609984 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002850752256770311, + "loss": 2.7811, + "theoretical_loss": 3.529315331023991, + "tokens_seen": 1437675520 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002850651955867603, + "loss": 2.7394, + "theoretical_loss": 3.529301030759461, + "tokens_seen": 1437741056 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002850551654964895, + "loss": 2.9202, + "theoretical_loss": 3.529286731329267, + "tokens_seen": 1437806592 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002850451354062187, + "loss": 2.6244, + "theoretical_loss": 3.5292724327333214, + "tokens_seen": 1437872128 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028503510531594785, + "loss": 2.7884, + "theoretical_loss": 3.5292581349715375, + "tokens_seen": 1437937664 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028502507522567704, + "loss": 2.7897, + "theoretical_loss": 3.529243838043829, + "tokens_seen": 1438003200 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002850150451354062, + "loss": 2.8664, + "theoretical_loss": 3.529229541950109, + "tokens_seen": 1438068736 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028500501504513545, + "loss": 2.88, + "theoretical_loss": 3.529215246690291, + "tokens_seen": 1438134272 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002849949849548646, + "loss": 2.9544, + "theoretical_loss": 3.529200952264288, + "tokens_seen": 1438199808 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002849849548645938, + "loss": 2.7702, + "theoretical_loss": 3.5291866586720144, + "tokens_seen": 1438265344 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028497492477432294, + "loss": 2.7379, + "theoretical_loss": 3.5291723659133822, + "tokens_seen": 1438330880 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002849648946840522, + "loss": 2.7449, + "theoretical_loss": 3.529158073988306, + "tokens_seen": 1438396416 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2297709, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6159000396728516, + "objective/train/theoretical_loss": 3.529143782896698, + "objective/train/tokens_used": 1458921952, + "theoretical_loss": 3.529143782896698, + "tokens_seen": 1438461952 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028495486459378136, + "loss": 2.7605, + "theoretical_loss": 3.529143782896698, + "tokens_seen": 1438461952 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028494483450351054, + "loss": 2.8105, + "theoretical_loss": 3.529129492638473, + "tokens_seen": 1438527488 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002849348044132397, + "loss": 2.9701, + "theoretical_loss": 3.529115203213543, + "tokens_seen": 1438593024 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002849247743229689, + "loss": 2.8093, + "theoretical_loss": 3.5291009146218224, + "tokens_seen": 1438658560 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002849147442326981, + "loss": 2.784, + "theoretical_loss": 3.5290866268632244, + "tokens_seen": 1438724096 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002849047141424273, + "loss": 2.7859, + "theoretical_loss": 3.529072339937663, + "tokens_seen": 1438789632 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028489468405215644, + "loss": 2.7893, + "theoretical_loss": 3.5290580538450502, + "tokens_seen": 1438855168 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002848846539618857, + "loss": 2.7884, + "theoretical_loss": 3.5290437685853013, + "tokens_seen": 1438920704 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028487462387161486, + "loss": 2.7918, + "theoretical_loss": 3.529029484158329, + "tokens_seen": 1438986240 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028486459378134404, + "loss": 2.8049, + "theoretical_loss": 3.5290152005640465, + "tokens_seen": 1439051776 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002848545636910732, + "loss": 2.7595, + "theoretical_loss": 3.529000917802368, + "tokens_seen": 1439117312 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002848445336008024, + "loss": 2.8512, + "theoretical_loss": 3.5289866358732063, + "tokens_seen": 1439182848 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002848345035105316, + "loss": 2.7769, + "theoretical_loss": 3.5289723547764758, + "tokens_seen": 1439248384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002848244734202608, + "loss": 2.7628, + "theoretical_loss": 3.5289580745120896, + "tokens_seen": 1439313920 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028481444332998995, + "loss": 2.7772, + "theoretical_loss": 3.528943795079961, + "tokens_seen": 1439379456 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002848044132397192, + "loss": 2.8174, + "theoretical_loss": 3.5289295164800043, + "tokens_seen": 1439444992 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002847943831494483, + "loss": 2.9782, + "theoretical_loss": 3.528915238712133, + "tokens_seen": 1439510528 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028478435305917755, + "loss": 2.9234, + "theoretical_loss": 3.5289009617762606, + "tokens_seen": 1439576064 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002847743229689067, + "loss": 2.9181, + "theoretical_loss": 3.5288866856723002, + "tokens_seen": 1439641600 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002847642928786359, + "loss": 2.7384, + "theoretical_loss": 3.528872410400166, + "tokens_seen": 1439707136 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002847542627883651, + "loss": 2.8156, + "theoretical_loss": 3.528858135959772, + "tokens_seen": 1439772672 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028474423269809427, + "loss": 2.8263, + "theoretical_loss": 3.5288438623510316, + "tokens_seen": 1439838208 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028473420260782345, + "loss": 2.8572, + "theoretical_loss": 3.528829589573858, + "tokens_seen": 1439903744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002847241725175527, + "loss": 2.8249, + "theoretical_loss": 3.5288153176281654, + "tokens_seen": 1439969280 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002847141424272818, + "loss": 2.9553, + "theoretical_loss": 3.528801046513867, + "tokens_seen": 1440034816 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2300538, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.710258722305298, + "objective/train/theoretical_loss": 3.5287867762308776, + "objective/train/tokens_used": 1460560352, + "theoretical_loss": 3.5287867762308776, + "tokens_seen": 1440100352 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028470411233701105, + "loss": 2.9145, + "theoretical_loss": 3.5287867762308776, + "tokens_seen": 1440100352 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028469408224674023, + "loss": 2.8564, + "theoretical_loss": 3.52877250677911, + "tokens_seen": 1440165888 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002846840521564694, + "loss": 2.6641, + "theoretical_loss": 3.528758238158478, + "tokens_seen": 1440231424 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002846740220661986, + "loss": 2.8783, + "theoretical_loss": 3.528743970368896, + "tokens_seen": 1440296960 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002846639919759278, + "loss": 2.7808, + "theoretical_loss": 3.5287297034102774, + "tokens_seen": 1440362496 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028465396188565695, + "loss": 2.7852, + "theoretical_loss": 3.528715437282536, + "tokens_seen": 1440428032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002846439317953862, + "loss": 2.7956, + "theoretical_loss": 3.5287011719855856, + "tokens_seen": 1440493568 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002846339017051153, + "loss": 2.836, + "theoretical_loss": 3.52868690751934, + "tokens_seen": 1440559104 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028462387161484455, + "loss": 2.6612, + "theoretical_loss": 3.528672643883713, + "tokens_seen": 1440624640 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002846138415245737, + "loss": 2.956, + "theoretical_loss": 3.5286583810786185, + "tokens_seen": 1440690176 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002846038114343029, + "loss": 2.9342, + "theoretical_loss": 3.528644119103971, + "tokens_seen": 1440755712 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002845937813440321, + "loss": 2.8002, + "theoretical_loss": 3.5286298579596833, + "tokens_seen": 1440821248 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002845837512537613, + "loss": 2.7457, + "theoretical_loss": 3.5286155976456697, + "tokens_seen": 1440886784 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028457372116349046, + "loss": 2.9904, + "theoretical_loss": 3.5286013381618444, + "tokens_seen": 1440952320 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002845636910732197, + "loss": 2.7924, + "theoretical_loss": 3.528587079508121, + "tokens_seen": 1441017856 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002845536609829488, + "loss": 2.7623, + "theoretical_loss": 3.5285728216844134, + "tokens_seen": 1441083392 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028454363089267805, + "loss": 2.7711, + "theoretical_loss": 3.528558564690636, + "tokens_seen": 1441148928 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028453360080240724, + "loss": 2.9197, + "theoretical_loss": 3.528544308526702, + "tokens_seen": 1441214464 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002845235707121364, + "loss": 2.8816, + "theoretical_loss": 3.528530053192526, + "tokens_seen": 1441280000 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028451354062186565, + "loss": 2.9005, + "theoretical_loss": 3.528515798688021, + "tokens_seen": 1441345536 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002845035105315948, + "loss": 2.8075, + "theoretical_loss": 3.5285015450131025, + "tokens_seen": 1441411072 + }, + { + "epoch": 4.02, + "learning_rate": 0.000284493480441324, + "loss": 2.835, + "theoretical_loss": 3.528487292167684, + "tokens_seen": 1441476608 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028448345035105314, + "loss": 2.8881, + "theoretical_loss": 3.5284730401516793, + "tokens_seen": 1441542144 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002844734202607824, + "loss": 2.8282, + "theoretical_loss": 3.5284587889650014, + "tokens_seen": 1441607680 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028446339017051156, + "loss": 2.8889, + "theoretical_loss": 3.5284445386075665, + "tokens_seen": 1441673216 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2303233, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6194636821746826, + "objective/train/theoretical_loss": 3.528430289079287, + "objective/train/tokens_used": 1462198752, + "theoretical_loss": 3.528430289079287, + "tokens_seen": 1441738752 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028445336008024074, + "loss": 2.8566, + "theoretical_loss": 3.528430289079287, + "tokens_seen": 1441738752 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002844433299899699, + "loss": 2.8953, + "theoretical_loss": 3.528416040380077, + "tokens_seen": 1441804288 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002844332998996991, + "loss": 2.8465, + "theoretical_loss": 3.5284017925098516, + "tokens_seen": 1441869824 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002844232698094283, + "loss": 2.8799, + "theoretical_loss": 3.528387545468524, + "tokens_seen": 1441935360 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002844132397191575, + "loss": 2.8442, + "theoretical_loss": 3.5283732992560095, + "tokens_seen": 1442000896 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028440320962888664, + "loss": 2.7224, + "theoretical_loss": 3.5283590538722205, + "tokens_seen": 1442066432 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002843931795386159, + "loss": 2.8804, + "theoretical_loss": 3.528344809317072, + "tokens_seen": 1442131968 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028438314944834506, + "loss": 2.7963, + "theoretical_loss": 3.528330565590479, + "tokens_seen": 1442197504 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028437311935807424, + "loss": 2.7106, + "theoretical_loss": 3.528316322692354, + "tokens_seen": 1442263040 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002843630892678034, + "loss": 2.8809, + "theoretical_loss": 3.5283020806226117, + "tokens_seen": 1442328576 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002843530591775326, + "loss": 2.9417, + "theoretical_loss": 3.5282878393811674, + "tokens_seen": 1442394112 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002843430290872618, + "loss": 2.8115, + "theoretical_loss": 3.528273598967934, + "tokens_seen": 1442459648 + }, + { + "epoch": 4.02, + "learning_rate": 0.000284332998996991, + "loss": 2.8832, + "theoretical_loss": 3.528259359382826, + "tokens_seen": 1442525184 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028432296890672015, + "loss": 2.7325, + "theoretical_loss": 3.5282451206257583, + "tokens_seen": 1442590720 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002843129388164494, + "loss": 2.7047, + "theoretical_loss": 3.5282308826966444, + "tokens_seen": 1442656256 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002843029087261785, + "loss": 2.6879, + "theoretical_loss": 3.528216645595399, + "tokens_seen": 1442721792 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028429287863590775, + "loss": 2.8531, + "theoretical_loss": 3.5282024093219357, + "tokens_seen": 1442787328 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002842828485456369, + "loss": 2.8532, + "theoretical_loss": 3.528188173876169, + "tokens_seen": 1442852864 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002842728184553661, + "loss": 2.8656, + "theoretical_loss": 3.528173939258014, + "tokens_seen": 1442918400 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002842627883650953, + "loss": 2.7635, + "theoretical_loss": 3.528159705467384, + "tokens_seen": 1442983936 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028425275827482447, + "loss": 2.807, + "theoretical_loss": 3.528145472504194, + "tokens_seen": 1443049472 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028424272818455365, + "loss": 2.9479, + "theoretical_loss": 3.528131240368358, + "tokens_seen": 1443115008 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002842326980942829, + "loss": 2.7516, + "theoretical_loss": 3.52811700905979, + "tokens_seen": 1443180544 + }, + { + "epoch": 4.02, + "learning_rate": 0.000284222668004012, + "loss": 2.6379, + "theoretical_loss": 3.528102778578405, + "tokens_seen": 1443246080 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028421263791374125, + "loss": 2.7874, + "theoretical_loss": 3.528088548924117, + "tokens_seen": 1443311616 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2304576, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.783816337585449, + "objective/train/theoretical_loss": 3.5280743200968403, + "objective/train/tokens_used": 1463837152, + "theoretical_loss": 3.5280743200968403, + "tokens_seen": 1443377152 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028420260782347043, + "loss": 2.8624, + "theoretical_loss": 3.5280743200968403, + "tokens_seen": 1443377152 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002841925777331996, + "loss": 2.9028, + "theoretical_loss": 3.5280600920964895, + "tokens_seen": 1443442688 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002841825476429288, + "loss": 2.8163, + "theoretical_loss": 3.528045864922979, + "tokens_seen": 1443508224 + }, + { + "epoch": 4.02, + "learning_rate": 0.000284172517552658, + "loss": 2.9494, + "theoretical_loss": 3.5280316385762234, + "tokens_seen": 1443573760 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028416248746238715, + "loss": 2.9119, + "theoretical_loss": 3.5280174130561366, + "tokens_seen": 1443639296 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002841524573721164, + "loss": 2.8607, + "theoretical_loss": 3.528003188362633, + "tokens_seen": 1443704832 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002841424272818455, + "loss": 2.7763, + "theoretical_loss": 3.527988964495628, + "tokens_seen": 1443770368 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028413239719157475, + "loss": 2.7994, + "theoretical_loss": 3.5279747414550355, + "tokens_seen": 1443835904 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002841223671013039, + "loss": 2.9002, + "theoretical_loss": 3.5279605192407697, + "tokens_seen": 1443901440 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002841123370110331, + "loss": 2.8693, + "theoretical_loss": 3.527946297852745, + "tokens_seen": 1443966976 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002841023069207623, + "loss": 2.6789, + "theoretical_loss": 3.527932077290876, + "tokens_seen": 1444032512 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002840922768304915, + "loss": 2.9359, + "theoretical_loss": 3.527917857555078, + "tokens_seen": 1444098048 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028408224674022066, + "loss": 2.7574, + "theoretical_loss": 3.527903638645265, + "tokens_seen": 1444163584 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002840722166499499, + "loss": 2.8415, + "theoretical_loss": 3.527889420561351, + "tokens_seen": 1444229120 + }, + { + "epoch": 4.02, + "learning_rate": 0.000284062186559679, + "loss": 3.0718, + "theoretical_loss": 3.5278752033032514, + "tokens_seen": 1444294656 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028405215646940825, + "loss": 2.8914, + "theoretical_loss": 3.52786098687088, + "tokens_seen": 1444360192 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002840421263791374, + "loss": 2.8145, + "theoretical_loss": 3.5278467712641524, + "tokens_seen": 1444425728 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002840320962888666, + "loss": 2.7111, + "theoretical_loss": 3.5278325564829824, + "tokens_seen": 1444491264 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002840220661985958, + "loss": 2.8987, + "theoretical_loss": 3.5278183425272847, + "tokens_seen": 1444556800 + }, + { + "epoch": 4.02, + "learning_rate": 0.000284012036108325, + "loss": 2.876, + "theoretical_loss": 3.5278041293969737, + "tokens_seen": 1444622336 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028400200601805416, + "loss": 2.8526, + "theoretical_loss": 3.5277899170919644, + "tokens_seen": 1444687872 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028399197592778334, + "loss": 2.7494, + "theoretical_loss": 3.5277757056121715, + "tokens_seen": 1444753408 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002839819458375125, + "loss": 2.6676, + "theoretical_loss": 3.5277614949575096, + "tokens_seen": 1444818944 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028397191574724176, + "loss": 2.8197, + "theoretical_loss": 3.5277472851278935, + "tokens_seen": 1444884480 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002839618856569709, + "loss": 2.8476, + "theoretical_loss": 3.5277330761232375, + "tokens_seen": 1444950016 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2307316, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.64445161819458, + "objective/train/theoretical_loss": 3.5277188679434563, + "objective/train/tokens_used": 1465475552, + "theoretical_loss": 3.5277188679434563, + "tokens_seen": 1445015552 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002839518555667001, + "loss": 2.8799, + "theoretical_loss": 3.5277188679434563, + "tokens_seen": 1445015552 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028394182547642925, + "loss": 2.7585, + "theoretical_loss": 3.5277046605884648, + "tokens_seen": 1445081088 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002839317953861585, + "loss": 2.8205, + "theoretical_loss": 3.5276904540581784, + "tokens_seen": 1445146624 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028392176529588766, + "loss": 2.7188, + "theoretical_loss": 3.52767624835251, + "tokens_seen": 1445212160 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028391173520561684, + "loss": 2.9999, + "theoretical_loss": 3.527662043471376, + "tokens_seen": 1445277696 + }, + { + "epoch": 4.02, + "learning_rate": 0.000283901705115346, + "loss": 2.7441, + "theoretical_loss": 3.5276478394146906, + "tokens_seen": 1445343232 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028389167502507526, + "loss": 2.7036, + "theoretical_loss": 3.527633636182369, + "tokens_seen": 1445408768 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002838816449348044, + "loss": 2.683, + "theoretical_loss": 3.527619433774325, + "tokens_seen": 1445474304 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002838716148445336, + "loss": 2.8133, + "theoretical_loss": 3.5276052321904743, + "tokens_seen": 1445539840 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028386158475426275, + "loss": 2.7236, + "theoretical_loss": 3.527591031430731, + "tokens_seen": 1445605376 + }, + { + "epoch": 4.02, + "learning_rate": 0.000283851554663992, + "loss": 2.7651, + "theoretical_loss": 3.527576831495011, + "tokens_seen": 1445670912 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028384152457372117, + "loss": 2.7502, + "theoretical_loss": 3.527562632383228, + "tokens_seen": 1445736448 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028383149448345035, + "loss": 2.843, + "theoretical_loss": 3.527548434095297, + "tokens_seen": 1445801984 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028382146439317953, + "loss": 2.7918, + "theoretical_loss": 3.5275342366311335, + "tokens_seen": 1445867520 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002838114343029087, + "loss": 2.681, + "theoretical_loss": 3.527520039990652, + "tokens_seen": 1445933056 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002838014042126379, + "loss": 2.795, + "theoretical_loss": 3.5275058441737674, + "tokens_seen": 1445998592 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002837913741223671, + "loss": 2.8672, + "theoretical_loss": 3.5274916491803947, + "tokens_seen": 1446064128 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002837813440320963, + "loss": 2.6605, + "theoretical_loss": 3.527477455010448, + "tokens_seen": 1446129664 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002837713139418255, + "loss": 2.7763, + "theoretical_loss": 3.5274632616638435, + "tokens_seen": 1446195200 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028376128385155467, + "loss": 2.8398, + "theoretical_loss": 3.527449069140496, + "tokens_seen": 1446260736 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028375125376128385, + "loss": 2.7989, + "theoretical_loss": 3.5274348774403195, + "tokens_seen": 1446326272 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002837412236710131, + "loss": 2.8109, + "theoretical_loss": 3.5274206865632296, + "tokens_seen": 1446391808 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002837311935807422, + "loss": 2.895, + "theoretical_loss": 3.527406496509141, + "tokens_seen": 1446457344 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028372116349047145, + "loss": 3.0066, + "theoretical_loss": 3.5273923072779687, + "tokens_seen": 1446522880 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028371113340020063, + "loss": 2.9373, + "theoretical_loss": 3.5273781188696276, + "tokens_seen": 1446588416 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2309991, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8522603511810303, + "objective/train/theoretical_loss": 3.5273639312840337, + "objective/train/tokens_used": 1467113952, + "theoretical_loss": 3.5273639312840337, + "tokens_seen": 1446653952 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002837011033099298, + "loss": 2.7304, + "theoretical_loss": 3.5273639312840337, + "tokens_seen": 1446653952 + }, + { + "epoch": 4.02, + "learning_rate": 0.000283691073219659, + "loss": 2.9044, + "theoretical_loss": 3.527349744521101, + "tokens_seen": 1446719488 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002836810431293882, + "loss": 2.8699, + "theoretical_loss": 3.527335558580744, + "tokens_seen": 1446785024 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028367101303911735, + "loss": 2.7231, + "theoretical_loss": 3.5273213734628794, + "tokens_seen": 1446850560 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002836609829488466, + "loss": 2.7159, + "theoretical_loss": 3.5273071891674213, + "tokens_seen": 1446916096 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002836509528585757, + "loss": 2.925, + "theoretical_loss": 3.5272930056942844, + "tokens_seen": 1446981632 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028364092276830495, + "loss": 2.7715, + "theoretical_loss": 3.527278823043385, + "tokens_seen": 1447047168 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002836308926780341, + "loss": 2.7616, + "theoretical_loss": 3.5272646412146367, + "tokens_seen": 1447112704 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002836208625877633, + "loss": 2.8729, + "theoretical_loss": 3.5272504602079557, + "tokens_seen": 1447178240 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002836108324974925, + "loss": 2.6944, + "theoretical_loss": 3.5272362800232564, + "tokens_seen": 1447243776 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002836008024072217, + "loss": 2.6784, + "theoretical_loss": 3.527222100660455, + "tokens_seen": 1447309312 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028359077231695086, + "loss": 2.8354, + "theoretical_loss": 3.527207922119466, + "tokens_seen": 1447374848 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002835807422266801, + "loss": 2.7637, + "theoretical_loss": 3.527193744400204, + "tokens_seen": 1447440384 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002835707121364092, + "loss": 2.7359, + "theoretical_loss": 3.5271795675025848, + "tokens_seen": 1447505920 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028356068204613846, + "loss": 2.8545, + "theoretical_loss": 3.5271653914265233, + "tokens_seen": 1447571456 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002835506519558676, + "loss": 2.8486, + "theoretical_loss": 3.5271512161719354, + "tokens_seen": 1447636992 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002835406218655968, + "loss": 2.7471, + "theoretical_loss": 3.5271370417387353, + "tokens_seen": 1447702528 + }, + { + "epoch": 4.02, + "learning_rate": 0.000283530591775326, + "loss": 2.8869, + "theoretical_loss": 3.527122868126839, + "tokens_seen": 1447768064 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002835205616850552, + "loss": 2.7947, + "theoretical_loss": 3.5271086953361617, + "tokens_seen": 1447833600 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028351053159478436, + "loss": 2.8655, + "theoretical_loss": 3.5270945233666176, + "tokens_seen": 1447899136 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028350050150451354, + "loss": 2.788, + "theoretical_loss": 3.5270803522181238, + "tokens_seen": 1447964672 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002834904714142427, + "loss": 2.7733, + "theoretical_loss": 3.5270661818905937, + "tokens_seen": 1448030208 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028348044132397196, + "loss": 2.8014, + "theoretical_loss": 3.527052012383944, + "tokens_seen": 1448095744 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002834704112337011, + "loss": 2.8002, + "theoretical_loss": 3.527037843698089, + "tokens_seen": 1448161280 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002834603811434303, + "loss": 2.8084, + "theoretical_loss": 3.527023675832945, + "tokens_seen": 1448226816 + }, + { + "epoch": 4.02, + "objective/train/docs_used": 2312674, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7035973072052, + "objective/train/theoretical_loss": 3.527009508788426, + "objective/train/tokens_used": 1468752352, + "theoretical_loss": 3.527009508788426, + "tokens_seen": 1448292352 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028345035105315945, + "loss": 2.9201, + "theoretical_loss": 3.527009508788426, + "tokens_seen": 1448292352 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002834403209628887, + "loss": 2.9134, + "theoretical_loss": 3.526995342564449, + "tokens_seen": 1448357888 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028343029087261786, + "loss": 2.6411, + "theoretical_loss": 3.526981177160928, + "tokens_seen": 1448423424 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028342026078234705, + "loss": 2.8774, + "theoretical_loss": 3.526967012577779, + "tokens_seen": 1448488960 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002834102306920762, + "loss": 2.8153, + "theoretical_loss": 3.5269528488149167, + "tokens_seen": 1448554496 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028340020060180546, + "loss": 2.7821, + "theoretical_loss": 3.526938685872257, + "tokens_seen": 1448620032 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002833901705115346, + "loss": 2.7071, + "theoretical_loss": 3.526924523749716, + "tokens_seen": 1448685568 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002833801404212638, + "loss": 2.6751, + "theoretical_loss": 3.5269103624472082, + "tokens_seen": 1448751104 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028337011033099295, + "loss": 2.8095, + "theoretical_loss": 3.5268962019646493, + "tokens_seen": 1448816640 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002833600802407222, + "loss": 2.7579, + "theoretical_loss": 3.526882042301954, + "tokens_seen": 1448882176 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028335005015045137, + "loss": 2.8128, + "theoretical_loss": 3.526867883459039, + "tokens_seen": 1448947712 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028334002006018055, + "loss": 2.8248, + "theoretical_loss": 3.526853725435819, + "tokens_seen": 1449013248 + }, + { + "epoch": 4.02, + "learning_rate": 0.00028332998996990973, + "loss": 2.9299, + "theoretical_loss": 3.5268395682322096, + "tokens_seen": 1449078784 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002833199598796389, + "loss": 2.8143, + "theoretical_loss": 3.5268254118481264, + "tokens_seen": 1449144320 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002833099297893681, + "loss": 2.7031, + "theoretical_loss": 3.5268112562834846, + "tokens_seen": 1449209856 + }, + { + "epoch": 4.02, + "learning_rate": 0.0002832998996990973, + "loss": 2.8151, + "theoretical_loss": 3.5267971015382003, + "tokens_seen": 1449275392 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028328986960882645, + "loss": 2.725, + "theoretical_loss": 3.5267829476121886, + "tokens_seen": 1449340928 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002832798395185557, + "loss": 2.8344, + "theoretical_loss": 3.526768794505365, + "tokens_seen": 1449406464 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002832698094282848, + "loss": 2.8335, + "theoretical_loss": 3.5267546422176452, + "tokens_seen": 1449472000 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028325977933801405, + "loss": 2.88, + "theoretical_loss": 3.526740490748945, + "tokens_seen": 1449537536 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028324974924774323, + "loss": 2.7802, + "theoretical_loss": 3.526726340099179, + "tokens_seen": 1449603072 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002832397191574724, + "loss": 2.873, + "theoretical_loss": 3.526712190268264, + "tokens_seen": 1449668608 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002832296890672016, + "loss": 2.792, + "theoretical_loss": 3.5266980412561155, + "tokens_seen": 1449734144 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028321965897693083, + "loss": 2.7892, + "theoretical_loss": 3.526683893062648, + "tokens_seen": 1449799680 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028320962888665996, + "loss": 2.8334, + "theoretical_loss": 3.5266697456877782, + "tokens_seen": 1449865216 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2315519, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3714334964752197, + "objective/train/theoretical_loss": 3.5266555991314217, + "objective/train/tokens_used": 1470390752, + "theoretical_loss": 3.5266555991314217, + "tokens_seen": 1449930752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002831995987963892, + "loss": 2.6396, + "theoretical_loss": 3.5266555991314217, + "tokens_seen": 1449930752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002831895687061183, + "loss": 2.766, + "theoretical_loss": 3.526641453393493, + "tokens_seen": 1449996288 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028317953861584755, + "loss": 2.6484, + "theoretical_loss": 3.526627308473909, + "tokens_seen": 1450061824 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028316950852557674, + "loss": 2.8603, + "theoretical_loss": 3.526613164372585, + "tokens_seen": 1450127360 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002831594784353059, + "loss": 2.9852, + "theoretical_loss": 3.526599021089437, + "tokens_seen": 1450192896 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002831494483450351, + "loss": 2.78, + "theoretical_loss": 3.5265848786243796, + "tokens_seen": 1450258432 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002831394182547643, + "loss": 2.6929, + "theoretical_loss": 3.5265707369773294, + "tokens_seen": 1450323968 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028312938816449346, + "loss": 2.9374, + "theoretical_loss": 3.526556596148202, + "tokens_seen": 1450389504 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002831193580742227, + "loss": 2.8474, + "theoretical_loss": 3.526542456136914, + "tokens_seen": 1450455040 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002831093279839518, + "loss": 2.837, + "theoretical_loss": 3.5265283169433794, + "tokens_seen": 1450520576 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028309929789368106, + "loss": 2.7395, + "theoretical_loss": 3.526514178567515, + "tokens_seen": 1450586112 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002830892678034102, + "loss": 2.8228, + "theoretical_loss": 3.5265000410092364, + "tokens_seen": 1450651648 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002830792377131394, + "loss": 2.8425, + "theoretical_loss": 3.5264859042684598, + "tokens_seen": 1450717184 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002830692076228686, + "loss": 2.7803, + "theoretical_loss": 3.5264717683451003, + "tokens_seen": 1450782720 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002830591775325978, + "loss": 2.822, + "theoretical_loss": 3.5264576332390742, + "tokens_seen": 1450848256 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028304914744232696, + "loss": 2.7812, + "theoretical_loss": 3.5264434989502966, + "tokens_seen": 1450913792 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002830391173520562, + "loss": 2.7968, + "theoretical_loss": 3.5264293654786845, + "tokens_seen": 1450979328 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002830290872617854, + "loss": 2.7828, + "theoretical_loss": 3.526415232824153, + "tokens_seen": 1451044864 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028301905717151456, + "loss": 2.7981, + "theoretical_loss": 3.526401100986618, + "tokens_seen": 1451110400 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028300902708124374, + "loss": 2.8553, + "theoretical_loss": 3.5263869699659955, + "tokens_seen": 1451175936 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002829989969909729, + "loss": 2.7644, + "theoretical_loss": 3.5263728397622014, + "tokens_seen": 1451241472 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028298896690070216, + "loss": 2.7783, + "theoretical_loss": 3.5263587103751517, + "tokens_seen": 1451307008 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002829789368104313, + "loss": 2.8859, + "theoretical_loss": 3.5263445818047616, + "tokens_seen": 1451372544 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002829689067201605, + "loss": 2.7238, + "theoretical_loss": 3.526330454050948, + "tokens_seen": 1451438080 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028295887662988965, + "loss": 2.7779, + "theoretical_loss": 3.5263163271136264, + "tokens_seen": 1451503616 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2316982, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9037554264068604, + "objective/train/theoretical_loss": 3.526302200992713, + "objective/train/tokens_used": 1472029152, + "theoretical_loss": 3.526302200992713, + "tokens_seen": 1451569152 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002829488465396189, + "loss": 2.8048, + "theoretical_loss": 3.526302200992713, + "tokens_seen": 1451569152 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028293881644934806, + "loss": 2.9085, + "theoretical_loss": 3.5262880756881234, + "tokens_seen": 1451634688 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028292878635907725, + "loss": 2.9644, + "theoretical_loss": 3.526273951199774, + "tokens_seen": 1451700224 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002829187562688064, + "loss": 2.6991, + "theoretical_loss": 3.5262598275275803, + "tokens_seen": 1451765760 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028290872617853566, + "loss": 2.8859, + "theoretical_loss": 3.5262457046714584, + "tokens_seen": 1451831296 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002828986960882648, + "loss": 2.8543, + "theoretical_loss": 3.5262315826313246, + "tokens_seen": 1451896832 + }, + { + "epoch": 4.03, + "learning_rate": 0.000282888665997994, + "loss": 2.9123, + "theoretical_loss": 3.5262174614070947, + "tokens_seen": 1451962368 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028287863590772315, + "loss": 2.7503, + "theoretical_loss": 3.526203340998684, + "tokens_seen": 1452027904 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002828686058174524, + "loss": 2.8642, + "theoretical_loss": 3.5261892214060104, + "tokens_seen": 1452093440 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028285857572718157, + "loss": 2.8969, + "theoretical_loss": 3.5261751026289887, + "tokens_seen": 1452158976 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028284854563691075, + "loss": 2.8372, + "theoretical_loss": 3.526160984667535, + "tokens_seen": 1452224512 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028283851554663993, + "loss": 2.878, + "theoretical_loss": 3.5261468675215655, + "tokens_seen": 1452290048 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002828284854563691, + "loss": 2.7106, + "theoretical_loss": 3.5261327511909966, + "tokens_seen": 1452355584 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002828184553660983, + "loss": 2.9046, + "theoretical_loss": 3.5261186356757444, + "tokens_seen": 1452421120 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002828084252758275, + "loss": 2.8667, + "theoretical_loss": 3.526104520975724, + "tokens_seen": 1452486656 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028279839518555665, + "loss": 2.8555, + "theoretical_loss": 3.5260904070908534, + "tokens_seen": 1452552192 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002827883650952859, + "loss": 2.8042, + "theoretical_loss": 3.526076294021047, + "tokens_seen": 1452617728 + }, + { + "epoch": 4.03, + "learning_rate": 0.000282778335005015, + "loss": 2.6735, + "theoretical_loss": 3.5260621817662217, + "tokens_seen": 1452683264 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028276830491474425, + "loss": 3.0151, + "theoretical_loss": 3.5260480703262935, + "tokens_seen": 1452748800 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028275827482447343, + "loss": 2.9027, + "theoretical_loss": 3.526033959701179, + "tokens_seen": 1452814336 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002827482447342026, + "loss": 2.7204, + "theoretical_loss": 3.526019849890794, + "tokens_seen": 1452879872 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002827382146439318, + "loss": 2.761, + "theoretical_loss": 3.5260057408950547, + "tokens_seen": 1452945408 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028272818455366103, + "loss": 2.9243, + "theoretical_loss": 3.5259916327138776, + "tokens_seen": 1453010944 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028271815446339016, + "loss": 2.9187, + "theoretical_loss": 3.525977525347179, + "tokens_seen": 1453076480 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002827081243731194, + "loss": 2.7721, + "theoretical_loss": 3.5259634187948743, + "tokens_seen": 1453142016 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2320009, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.014775514602661, + "objective/train/theoretical_loss": 3.5259493130568806, + "objective/train/tokens_used": 1473667552, + "theoretical_loss": 3.5259493130568806, + "tokens_seen": 1453207552 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002826980942828485, + "loss": 2.7904, + "theoretical_loss": 3.5259493130568806, + "tokens_seen": 1453207552 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028268806419257775, + "loss": 2.7454, + "theoretical_loss": 3.5259352081331143, + "tokens_seen": 1453273088 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028267803410230694, + "loss": 2.7129, + "theoretical_loss": 3.5259211040234915, + "tokens_seen": 1453338624 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002826680040120361, + "loss": 2.7553, + "theoretical_loss": 3.525907000727928, + "tokens_seen": 1453404160 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002826579739217653, + "loss": 2.7552, + "theoretical_loss": 3.5258928982463402, + "tokens_seen": 1453469696 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002826479438314945, + "loss": 2.6499, + "theoretical_loss": 3.525878796578645, + "tokens_seen": 1453535232 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028263791374122366, + "loss": 2.8047, + "theoretical_loss": 3.525864695724758, + "tokens_seen": 1453600768 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002826278836509529, + "loss": 2.7931, + "theoretical_loss": 3.525850595684597, + "tokens_seen": 1453666304 + }, + { + "epoch": 4.03, + "learning_rate": 0.000282617853560682, + "loss": 2.6674, + "theoretical_loss": 3.5258364964580764, + "tokens_seen": 1453731840 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028260782347041126, + "loss": 2.8971, + "theoretical_loss": 3.5258223980451135, + "tokens_seen": 1453797376 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002825977933801404, + "loss": 2.8992, + "theoretical_loss": 3.525808300445625, + "tokens_seen": 1453862912 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002825877632898696, + "loss": 2.9014, + "theoretical_loss": 3.525794203659527, + "tokens_seen": 1453928448 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002825777331995988, + "loss": 2.8737, + "theoretical_loss": 3.525780107686735, + "tokens_seen": 1453993984 + }, + { + "epoch": 4.03, + "learning_rate": 0.000282567703109328, + "loss": 2.7167, + "theoretical_loss": 3.5257660125271677, + "tokens_seen": 1454059520 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028255767301905716, + "loss": 2.9171, + "theoretical_loss": 3.5257519181807395, + "tokens_seen": 1454125056 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002825476429287864, + "loss": 2.7919, + "theoretical_loss": 3.5257378246473676, + "tokens_seen": 1454190592 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002825376128385155, + "loss": 2.889, + "theoretical_loss": 3.525723731926968, + "tokens_seen": 1454256128 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028252758274824476, + "loss": 2.8744, + "theoretical_loss": 3.525709640019458, + "tokens_seen": 1454321664 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002825175526579739, + "loss": 2.7128, + "theoretical_loss": 3.5256955489247535, + "tokens_seen": 1454387200 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002825075225677031, + "loss": 2.766, + "theoretical_loss": 3.5256814586427705, + "tokens_seen": 1454452736 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002824974924774323, + "loss": 2.8257, + "theoretical_loss": 3.5256673691734264, + "tokens_seen": 1454518272 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002824874623871615, + "loss": 2.8385, + "theoretical_loss": 3.5256532805166376, + "tokens_seen": 1454583808 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028247743229689067, + "loss": 2.9484, + "theoretical_loss": 3.5256391926723207, + "tokens_seen": 1454649344 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028246740220661985, + "loss": 2.8556, + "theoretical_loss": 3.5256251056403918, + "tokens_seen": 1454714880 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028245737211634903, + "loss": 2.6747, + "theoretical_loss": 3.5256110194207673, + "tokens_seen": 1454780416 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2322855, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.942119836807251, + "objective/train/theoretical_loss": 3.5255969340133646, + "objective/train/tokens_used": 1475305952, + "theoretical_loss": 3.5255969340133646, + "tokens_seen": 1454845952 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028244734202607826, + "loss": 2.9124, + "theoretical_loss": 3.5255969340133646, + "tokens_seen": 1454845952 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002824373119358074, + "loss": 2.7977, + "theoretical_loss": 3.5255828494180994, + "tokens_seen": 1454911488 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002824272818455366, + "loss": 2.7714, + "theoretical_loss": 3.5255687656348886, + "tokens_seen": 1454977024 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002824172517552658, + "loss": 2.8817, + "theoretical_loss": 3.5255546826636497, + "tokens_seen": 1455042560 + }, + { + "epoch": 4.03, + "learning_rate": 0.000282407221664995, + "loss": 2.8209, + "theoretical_loss": 3.525540600504298, + "tokens_seen": 1455108096 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028239719157472417, + "loss": 2.8436, + "theoretical_loss": 3.5255265191567506, + "tokens_seen": 1455173632 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028238716148445335, + "loss": 2.6262, + "theoretical_loss": 3.5255124386209244, + "tokens_seen": 1455239168 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028237713139418253, + "loss": 2.7998, + "theoretical_loss": 3.5254983588967357, + "tokens_seen": 1455304704 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028236710130391177, + "loss": 2.7876, + "theoretical_loss": 3.5254842799841013, + "tokens_seen": 1455370240 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002823570712136409, + "loss": 2.8759, + "theoretical_loss": 3.5254702018829382, + "tokens_seen": 1455435776 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028234704112337013, + "loss": 2.9287, + "theoretical_loss": 3.5254561245931626, + "tokens_seen": 1455501312 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028233701103309926, + "loss": 2.691, + "theoretical_loss": 3.5254420481146913, + "tokens_seen": 1455566848 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002823269809428285, + "loss": 2.7865, + "theoretical_loss": 3.5254279724474413, + "tokens_seen": 1455632384 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002823169508525577, + "loss": 2.9158, + "theoretical_loss": 3.525413897591329, + "tokens_seen": 1455697920 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028230692076228685, + "loss": 2.9199, + "theoretical_loss": 3.5253998235462713, + "tokens_seen": 1455763456 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028229689067201604, + "loss": 2.844, + "theoretical_loss": 3.5253857503121853, + "tokens_seen": 1455828992 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002822868605817452, + "loss": 2.9008, + "theoretical_loss": 3.5253716778889874, + "tokens_seen": 1455894528 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028227683049147445, + "loss": 2.6755, + "theoretical_loss": 3.525357606276594, + "tokens_seen": 1455960064 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028226680040120363, + "loss": 2.7155, + "theoretical_loss": 3.5253435354749225, + "tokens_seen": 1456025600 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002822567703109328, + "loss": 2.844, + "theoretical_loss": 3.525329465483889, + "tokens_seen": 1456091136 + }, + { + "epoch": 4.03, + "learning_rate": 0.000282246740220662, + "loss": 2.7453, + "theoretical_loss": 3.5253153963034114, + "tokens_seen": 1456156672 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028223671013039123, + "loss": 2.8721, + "theoretical_loss": 3.525301327933406, + "tokens_seen": 1456222208 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028222668004012036, + "loss": 2.7413, + "theoretical_loss": 3.5252872603737893, + "tokens_seen": 1456287744 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002822166499498496, + "loss": 2.8744, + "theoretical_loss": 3.5252731936244786, + "tokens_seen": 1456353280 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002822066198595787, + "loss": 3.0024, + "theoretical_loss": 3.5252591276853904, + "tokens_seen": 1456418816 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2325909, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.713939905166626, + "objective/train/theoretical_loss": 3.5252450625564418, + "objective/train/tokens_used": 1476944352, + "theoretical_loss": 3.5252450625564418, + "tokens_seen": 1456484352 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028219658976930795, + "loss": 2.8709, + "theoretical_loss": 3.5252450625564418, + "tokens_seen": 1456484352 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028218655967903714, + "loss": 2.7143, + "theoretical_loss": 3.5252309982375496, + "tokens_seen": 1456549888 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002821765295887663, + "loss": 2.657, + "theoretical_loss": 3.525216934728631, + "tokens_seen": 1456615424 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002821664994984955, + "loss": 2.8746, + "theoretical_loss": 3.5252028720296025, + "tokens_seen": 1456680960 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002821564694082247, + "loss": 2.7491, + "theoretical_loss": 3.5251888101403814, + "tokens_seen": 1456746496 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028214643931795386, + "loss": 2.7772, + "theoretical_loss": 3.5251747490608842, + "tokens_seen": 1456812032 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002821364092276831, + "loss": 2.8027, + "theoretical_loss": 3.525160688791028, + "tokens_seen": 1456877568 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002821263791374122, + "loss": 2.8064, + "theoretical_loss": 3.52514662933073, + "tokens_seen": 1456943104 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028211634904714146, + "loss": 2.8326, + "theoretical_loss": 3.525132570679907, + "tokens_seen": 1457008640 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002821063189568706, + "loss": 2.8052, + "theoretical_loss": 3.5251185128384757, + "tokens_seen": 1457074176 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002820962888665998, + "loss": 2.6964, + "theoretical_loss": 3.5251044558063542, + "tokens_seen": 1457139712 + }, + { + "epoch": 4.03, + "learning_rate": 0.000282086258776329, + "loss": 2.7494, + "theoretical_loss": 3.5250903995834584, + "tokens_seen": 1457205248 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002820762286860582, + "loss": 2.9094, + "theoretical_loss": 3.5250763441697055, + "tokens_seen": 1457270784 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028206619859578736, + "loss": 2.9352, + "theoretical_loss": 3.5250622895650126, + "tokens_seen": 1457336320 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002820561685055166, + "loss": 2.6658, + "theoretical_loss": 3.525048235769297, + "tokens_seen": 1457401856 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002820461384152457, + "loss": 2.8643, + "theoretical_loss": 3.525034182782475, + "tokens_seen": 1457467392 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028203610832497496, + "loss": 2.8035, + "theoretical_loss": 3.5250201306044655, + "tokens_seen": 1457532928 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002820260782347041, + "loss": 2.7148, + "theoretical_loss": 3.5250060792351836, + "tokens_seen": 1457598464 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002820160481444333, + "loss": 2.8478, + "theoretical_loss": 3.5249920286745473, + "tokens_seen": 1457664000 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002820060180541625, + "loss": 2.7417, + "theoretical_loss": 3.5249779789224736, + "tokens_seen": 1457729536 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002819959879638917, + "loss": 2.9713, + "theoretical_loss": 3.5249639299788793, + "tokens_seen": 1457795072 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028198595787362087, + "loss": 2.6949, + "theoretical_loss": 3.524949881843682, + "tokens_seen": 1457860608 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028197592778335005, + "loss": 2.7665, + "theoretical_loss": 3.524935834516799, + "tokens_seen": 1457926144 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028196589769307923, + "loss": 2.9374, + "theoretical_loss": 3.5249217879981467, + "tokens_seen": 1457991680 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028195586760280846, + "loss": 2.6887, + "theoretical_loss": 3.5249077422876427, + "tokens_seen": 1458057216 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2328927, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5560050010681152, + "objective/train/theoretical_loss": 3.5248936973852043, + "objective/train/tokens_used": 1478582752, + "theoretical_loss": 3.5248936973852043, + "tokens_seen": 1458122752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002819458375125376, + "loss": 2.7349, + "theoretical_loss": 3.5248936973852043, + "tokens_seen": 1458122752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002819358074222668, + "loss": 2.8474, + "theoretical_loss": 3.524879653290748, + "tokens_seen": 1458188288 + }, + { + "epoch": 4.03, + "learning_rate": 0.000281925777331996, + "loss": 2.7937, + "theoretical_loss": 3.524865610004192, + "tokens_seen": 1458253824 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002819157472417252, + "loss": 2.8185, + "theoretical_loss": 3.524851567525454, + "tokens_seen": 1458319360 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028190571715145437, + "loss": 2.6779, + "theoretical_loss": 3.5248375258544495, + "tokens_seen": 1458384896 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028189568706118355, + "loss": 2.7401, + "theoretical_loss": 3.5248234849910967, + "tokens_seen": 1458450432 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028188565697091273, + "loss": 2.8371, + "theoretical_loss": 3.524809444935313, + "tokens_seen": 1458515968 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028187562688064197, + "loss": 2.7926, + "theoretical_loss": 3.5247954056870148, + "tokens_seen": 1458581504 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002818655967903711, + "loss": 2.7882, + "theoretical_loss": 3.52478136724612, + "tokens_seen": 1458647040 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028185556670010033, + "loss": 2.7924, + "theoretical_loss": 3.5247673296125464, + "tokens_seen": 1458712576 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028184553660982946, + "loss": 2.8027, + "theoretical_loss": 3.5247532927862104, + "tokens_seen": 1458778112 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002818355065195587, + "loss": 2.7375, + "theoretical_loss": 3.52473925676703, + "tokens_seen": 1458843648 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002818254764292879, + "loss": 2.7847, + "theoretical_loss": 3.524725221554922, + "tokens_seen": 1458909184 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028181544633901705, + "loss": 2.7787, + "theoretical_loss": 3.524711187149804, + "tokens_seen": 1458974720 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028180541624874624, + "loss": 2.8421, + "theoretical_loss": 3.5246971535515934, + "tokens_seen": 1459040256 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002817953861584754, + "loss": 2.9492, + "theoretical_loss": 3.524683120760207, + "tokens_seen": 1459105792 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002817853560682046, + "loss": 2.8036, + "theoretical_loss": 3.524669088775563, + "tokens_seen": 1459171328 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028177532597793383, + "loss": 2.8581, + "theoretical_loss": 3.5246550575975792, + "tokens_seen": 1459236864 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028176529588766296, + "loss": 2.7652, + "theoretical_loss": 3.5246410272261715, + "tokens_seen": 1459302400 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002817552657973922, + "loss": 2.6985, + "theoretical_loss": 3.524626997661258, + "tokens_seen": 1459367936 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002817452357071214, + "loss": 2.7886, + "theoretical_loss": 3.5246129689027565, + "tokens_seen": 1459433472 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028173520561685056, + "loss": 2.8487, + "theoretical_loss": 3.5245989409505842, + "tokens_seen": 1459499008 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028172517552657974, + "loss": 2.7678, + "theoretical_loss": 3.524584913804658, + "tokens_seen": 1459564544 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002817151454363089, + "loss": 2.7567, + "theoretical_loss": 3.524570887464896, + "tokens_seen": 1459630080 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002817051153460381, + "loss": 2.7494, + "theoretical_loss": 3.5245568619312158, + "tokens_seen": 1459695616 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2330225, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.944275379180908, + "objective/train/theoretical_loss": 3.524542837203535, + "objective/train/tokens_used": 1480221152, + "theoretical_loss": 3.524542837203535, + "tokens_seen": 1459761152 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028169508525576734, + "loss": 2.9982, + "theoretical_loss": 3.524542837203535, + "tokens_seen": 1459761152 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028168505516549646, + "loss": 2.8227, + "theoretical_loss": 3.52452881328177, + "tokens_seen": 1459826688 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002816750250752257, + "loss": 2.7639, + "theoretical_loss": 3.5245147901658394, + "tokens_seen": 1459892224 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002816649949849548, + "loss": 2.8619, + "theoretical_loss": 3.52450076785566, + "tokens_seen": 1459957760 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028165496489468406, + "loss": 2.7876, + "theoretical_loss": 3.5244867463511502, + "tokens_seen": 1460023296 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028164493480441324, + "loss": 2.8303, + "theoretical_loss": 3.5244727256522266, + "tokens_seen": 1460088832 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002816349047141424, + "loss": 2.6941, + "theoretical_loss": 3.5244587057588075, + "tokens_seen": 1460154368 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002816248746238716, + "loss": 2.9016, + "theoretical_loss": 3.52444468667081, + "tokens_seen": 1460219904 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002816148445336008, + "loss": 2.8804, + "theoretical_loss": 3.524430668388152, + "tokens_seen": 1460285440 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028160481444332997, + "loss": 2.8629, + "theoretical_loss": 3.5244166509107506, + "tokens_seen": 1460350976 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002815947843530592, + "loss": 2.8501, + "theoretical_loss": 3.524402634238524, + "tokens_seen": 1460416512 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028158475426278833, + "loss": 2.8762, + "theoretical_loss": 3.5243886183713897, + "tokens_seen": 1460482048 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028157472417251756, + "loss": 2.5663, + "theoretical_loss": 3.5243746033092656, + "tokens_seen": 1460547584 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028156469408224674, + "loss": 3.0241, + "theoretical_loss": 3.524360589052068, + "tokens_seen": 1460613120 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002815546639919759, + "loss": 2.8473, + "theoretical_loss": 3.5243465755997163, + "tokens_seen": 1460678656 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002815446339017051, + "loss": 2.918, + "theoretical_loss": 3.524332562952127, + "tokens_seen": 1460744192 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002815346038114343, + "loss": 2.7954, + "theoretical_loss": 3.5243185511092183, + "tokens_seen": 1460809728 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002815245737211635, + "loss": 2.8365, + "theoretical_loss": 3.524304540070908, + "tokens_seen": 1460875264 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002815145436308927, + "loss": 2.8516, + "theoretical_loss": 3.524290529837113, + "tokens_seen": 1460940800 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002815045135406219, + "loss": 2.9666, + "theoretical_loss": 3.524276520407752, + "tokens_seen": 1461006336 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028149448345035107, + "loss": 2.9391, + "theoretical_loss": 3.524262511782742, + "tokens_seen": 1461071872 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028148445336008025, + "loss": 2.8194, + "theoretical_loss": 3.524248503962001, + "tokens_seen": 1461137408 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028147442326980943, + "loss": 2.8135, + "theoretical_loss": 3.5242344969454473, + "tokens_seen": 1461202944 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028146439317953866, + "loss": 2.7794, + "theoretical_loss": 3.5242204907329984, + "tokens_seen": 1461268480 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002814543630892678, + "loss": 2.9288, + "theoretical_loss": 3.524206485324571, + "tokens_seen": 1461334016 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2333652, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.783522844314575, + "objective/train/theoretical_loss": 3.5241924807200844, + "objective/train/tokens_used": 1481859552, + "theoretical_loss": 3.5241924807200844, + "tokens_seen": 1461399552 + }, + { + "epoch": 4.03, + "learning_rate": 0.000281444332998997, + "loss": 2.8649, + "theoretical_loss": 3.5241924807200844, + "tokens_seen": 1461399552 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002814343029087262, + "loss": 2.8709, + "theoretical_loss": 3.5241784769194555, + "tokens_seen": 1461465088 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002814242728184554, + "loss": 2.833, + "theoretical_loss": 3.5241644739226023, + "tokens_seen": 1461530624 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028141424272818457, + "loss": 2.7099, + "theoretical_loss": 3.524150471729443, + "tokens_seen": 1461596160 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028140421263791375, + "loss": 2.7834, + "theoretical_loss": 3.524136470339895, + "tokens_seen": 1461661696 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028139418254764293, + "loss": 2.7341, + "theoretical_loss": 3.524122469753876, + "tokens_seen": 1461727232 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028138415245737217, + "loss": 2.8198, + "theoretical_loss": 3.5241084699713046, + "tokens_seen": 1461792768 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002813741223671013, + "loss": 2.6718, + "theoretical_loss": 3.5240944709920985, + "tokens_seen": 1461858304 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028136409227683053, + "loss": 2.7544, + "theoretical_loss": 3.5240804728161743, + "tokens_seen": 1461923840 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028135406218655966, + "loss": 2.7598, + "theoretical_loss": 3.5240664754434516, + "tokens_seen": 1461989376 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002813440320962889, + "loss": 2.9471, + "theoretical_loss": 3.5240524788738474, + "tokens_seen": 1462054912 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002813340020060181, + "loss": 2.7053, + "theoretical_loss": 3.52403848310728, + "tokens_seen": 1462120448 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028132397191574725, + "loss": 2.7788, + "theoretical_loss": 3.5240244881436675, + "tokens_seen": 1462185984 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028131394182547644, + "loss": 2.9369, + "theoretical_loss": 3.524010493982927, + "tokens_seen": 1462251520 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002813039117352056, + "loss": 2.7509, + "theoretical_loss": 3.523996500624977, + "tokens_seen": 1462317056 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002812938816449348, + "loss": 2.6229, + "theoretical_loss": 3.523982508069736, + "tokens_seen": 1462382592 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028128385155466403, + "loss": 2.7255, + "theoretical_loss": 3.5239685163171206, + "tokens_seen": 1462448128 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028127382146439316, + "loss": 3.0247, + "theoretical_loss": 3.52395452536705, + "tokens_seen": 1462513664 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002812637913741224, + "loss": 2.8565, + "theoretical_loss": 3.523940535219442, + "tokens_seen": 1462579200 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002812537612838516, + "loss": 2.9175, + "theoretical_loss": 3.5239265458742146, + "tokens_seen": 1462644736 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028124373119358076, + "loss": 2.7294, + "theoretical_loss": 3.5239125573312853, + "tokens_seen": 1462710272 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028123370110330994, + "loss": 2.7855, + "theoretical_loss": 3.523898569590573, + "tokens_seen": 1462775808 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002812236710130391, + "loss": 2.9251, + "theoretical_loss": 3.523884582651995, + "tokens_seen": 1462841344 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002812136409227683, + "loss": 2.7432, + "theoretical_loss": 3.5238705965154695, + "tokens_seen": 1462906880 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028120361083249754, + "loss": 2.7412, + "theoretical_loss": 3.523856611180915, + "tokens_seen": 1462972416 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2336250, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5999882221221924, + "objective/train/theoretical_loss": 3.5238426266482494, + "objective/train/tokens_used": 1483497952, + "theoretical_loss": 3.5238426266482494, + "tokens_seen": 1463037952 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028119358074222666, + "loss": 2.5978, + "theoretical_loss": 3.5238426266482494, + "tokens_seen": 1463037952 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002811835506519559, + "loss": 2.7615, + "theoretical_loss": 3.523828642917391, + "tokens_seen": 1463103488 + }, + { + "epoch": 4.03, + "learning_rate": 0.000281173520561685, + "loss": 2.7823, + "theoretical_loss": 3.523814659988257, + "tokens_seen": 1463169024 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028116349047141426, + "loss": 2.7597, + "theoretical_loss": 3.5238006778607667, + "tokens_seen": 1463234560 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028115346038114344, + "loss": 2.5147, + "theoretical_loss": 3.5237866965348372, + "tokens_seen": 1463300096 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002811434302908726, + "loss": 2.8833, + "theoretical_loss": 3.523772716010388, + "tokens_seen": 1463365632 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002811334002006018, + "loss": 2.7159, + "theoretical_loss": 3.523758736287336, + "tokens_seen": 1463431168 + }, + { + "epoch": 4.03, + "learning_rate": 0.000281123370110331, + "loss": 2.8196, + "theoretical_loss": 3.5237447573655998, + "tokens_seen": 1463496704 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028111334002006017, + "loss": 2.9469, + "theoretical_loss": 3.5237307792450974, + "tokens_seen": 1463562240 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002811033099297894, + "loss": 2.9369, + "theoretical_loss": 3.5237168019257474, + "tokens_seen": 1463627776 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028109327983951853, + "loss": 2.7259, + "theoretical_loss": 3.523702825407468, + "tokens_seen": 1463693312 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028108324974924776, + "loss": 2.8543, + "theoretical_loss": 3.5236888496901773, + "tokens_seen": 1463758848 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028107321965897695, + "loss": 2.7013, + "theoretical_loss": 3.5236748747737936, + "tokens_seen": 1463824384 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002810631895687061, + "loss": 2.8319, + "theoretical_loss": 3.5236609006582347, + "tokens_seen": 1463889920 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002810531594784353, + "loss": 2.8895, + "theoretical_loss": 3.523646927343419, + "tokens_seen": 1463955456 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002810431293881645, + "loss": 2.878, + "theoretical_loss": 3.5236329548292655, + "tokens_seen": 1464020992 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028103309929789367, + "loss": 2.8422, + "theoretical_loss": 3.5236189831156914, + "tokens_seen": 1464086528 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002810230692076229, + "loss": 2.7829, + "theoretical_loss": 3.5236050122026157, + "tokens_seen": 1464152064 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028101303911735203, + "loss": 2.8942, + "theoretical_loss": 3.523591042089957, + "tokens_seen": 1464217600 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028100300902708127, + "loss": 2.7706, + "theoretical_loss": 3.523577072777633, + "tokens_seen": 1464283136 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002809929789368104, + "loss": 2.8178, + "theoretical_loss": 3.523563104265562, + "tokens_seen": 1464348672 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028098294884653963, + "loss": 2.6872, + "theoretical_loss": 3.5235491365536626, + "tokens_seen": 1464414208 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002809729187562688, + "loss": 2.7173, + "theoretical_loss": 3.5235351696418533, + "tokens_seen": 1464479744 + }, + { + "epoch": 4.03, + "learning_rate": 0.000280962888665998, + "loss": 2.7767, + "theoretical_loss": 3.523521203530052, + "tokens_seen": 1464545280 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028095285857572717, + "loss": 2.9063, + "theoretical_loss": 3.523507238218178, + "tokens_seen": 1464610816 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2337620, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0054116249084473, + "objective/train/theoretical_loss": 3.5234932737061486, + "objective/train/tokens_used": 1485136352, + "theoretical_loss": 3.5234932737061486, + "tokens_seen": 1464676352 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028094282848545635, + "loss": 2.8582, + "theoretical_loss": 3.5234932737061486, + "tokens_seen": 1464676352 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028093279839518553, + "loss": 2.7149, + "theoretical_loss": 3.523479309993882, + "tokens_seen": 1464741888 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028092276830491477, + "loss": 2.8505, + "theoretical_loss": 3.5234653470812987, + "tokens_seen": 1464807424 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002809127382146439, + "loss": 2.8183, + "theoretical_loss": 3.5234513849683147, + "tokens_seen": 1464872960 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028090270812437313, + "loss": 2.7624, + "theoretical_loss": 3.5234374236548494, + "tokens_seen": 1464938496 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002808926780341023, + "loss": 2.7594, + "theoretical_loss": 3.523423463140822, + "tokens_seen": 1465004032 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002808826479438315, + "loss": 2.7116, + "theoretical_loss": 3.5234095034261497, + "tokens_seen": 1465069568 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002808726178535607, + "loss": 2.7287, + "theoretical_loss": 3.523395544510752, + "tokens_seen": 1465135104 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028086258776328986, + "loss": 2.7167, + "theoretical_loss": 3.523381586394546, + "tokens_seen": 1465200640 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028085255767301904, + "loss": 2.6956, + "theoretical_loss": 3.523367629077452, + "tokens_seen": 1465266176 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002808425275827483, + "loss": 2.8535, + "theoretical_loss": 3.5233536725593875, + "tokens_seen": 1465331712 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002808324974924774, + "loss": 2.7903, + "theoretical_loss": 3.5233397168402707, + "tokens_seen": 1465397248 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028082246740220664, + "loss": 2.9268, + "theoretical_loss": 3.523325761920021, + "tokens_seen": 1465462784 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028081243731193576, + "loss": 2.8769, + "theoretical_loss": 3.5233118077985566, + "tokens_seen": 1465528320 + }, + { + "epoch": 4.03, + "learning_rate": 0.000280802407221665, + "loss": 2.878, + "theoretical_loss": 3.5232978544757962, + "tokens_seen": 1465593856 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002807923771313942, + "loss": 2.8935, + "theoretical_loss": 3.5232839019516575, + "tokens_seen": 1465659392 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028078234704112336, + "loss": 2.8031, + "theoretical_loss": 3.5232699502260605, + "tokens_seen": 1465724928 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002807723169508526, + "loss": 2.7038, + "theoretical_loss": 3.5232559992989225, + "tokens_seen": 1465790464 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002807622868605818, + "loss": 2.7854, + "theoretical_loss": 3.523242049170163, + "tokens_seen": 1465856000 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028075225677031096, + "loss": 2.7101, + "theoretical_loss": 3.5232280998397, + "tokens_seen": 1465921536 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028074222668004014, + "loss": 2.9451, + "theoretical_loss": 3.5232141513074526, + "tokens_seen": 1465987072 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002807321965897693, + "loss": 2.9059, + "theoretical_loss": 3.5232002035733396, + "tokens_seen": 1466052608 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002807221664994985, + "loss": 2.7513, + "theoretical_loss": 3.5231862566372785, + "tokens_seen": 1466118144 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028071213640922774, + "loss": 2.7292, + "theoretical_loss": 3.5231723104991897, + "tokens_seen": 1466183680 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028070210631895686, + "loss": 2.7835, + "theoretical_loss": 3.52315836515899, + "tokens_seen": 1466249216 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2340479, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.843994379043579, + "objective/train/theoretical_loss": 3.5231444206165996, + "objective/train/tokens_used": 1486774752, + "theoretical_loss": 3.5231444206165996, + "tokens_seen": 1466314752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002806920762286861, + "loss": 2.9063, + "theoretical_loss": 3.5231444206165996, + "tokens_seen": 1466314752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002806820461384152, + "loss": 2.7115, + "theoretical_loss": 3.5231304768719367, + "tokens_seen": 1466380288 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028067201604814446, + "loss": 2.7548, + "theoretical_loss": 3.52311653392492, + "tokens_seen": 1466445824 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028066198595787364, + "loss": 2.8717, + "theoretical_loss": 3.5231025917754684, + "tokens_seen": 1466511360 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002806519558676028, + "loss": 2.7828, + "theoretical_loss": 3.5230886504234995, + "tokens_seen": 1466576896 + }, + { + "epoch": 4.03, + "learning_rate": 0.000280641925777332, + "loss": 2.9171, + "theoretical_loss": 3.5230747098689337, + "tokens_seen": 1466642432 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002806318956870612, + "loss": 2.8976, + "theoretical_loss": 3.523060770111689, + "tokens_seen": 1466707968 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028062186559679037, + "loss": 2.8851, + "theoretical_loss": 3.5230468311516843, + "tokens_seen": 1466773504 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002806118355065196, + "loss": 2.7763, + "theoretical_loss": 3.523032892988838, + "tokens_seen": 1466839040 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028060180541624873, + "loss": 2.7751, + "theoretical_loss": 3.52301895562307, + "tokens_seen": 1466904576 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028059177532597796, + "loss": 2.8348, + "theoretical_loss": 3.5230050190542976, + "tokens_seen": 1466970112 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028058174523570715, + "loss": 2.8703, + "theoretical_loss": 3.5229910832824407, + "tokens_seen": 1467035648 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002805717151454363, + "loss": 2.8795, + "theoretical_loss": 3.522977148307417, + "tokens_seen": 1467101184 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002805616850551655, + "loss": 2.934, + "theoretical_loss": 3.522963214129147, + "tokens_seen": 1467166720 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002805516549648947, + "loss": 2.8597, + "theoretical_loss": 3.5229492807475484, + "tokens_seen": 1467232256 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028054162487462387, + "loss": 2.7703, + "theoretical_loss": 3.52293534816254, + "tokens_seen": 1467297792 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002805315947843531, + "loss": 2.7444, + "theoretical_loss": 3.5229214163740417, + "tokens_seen": 1467363328 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028052156469408223, + "loss": 2.7645, + "theoretical_loss": 3.5229074853819715, + "tokens_seen": 1467428864 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028051153460381147, + "loss": 2.6134, + "theoretical_loss": 3.5228935551862484, + "tokens_seen": 1467494400 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002805015045135406, + "loss": 2.8645, + "theoretical_loss": 3.5228796257867914, + "tokens_seen": 1467559936 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028049147442326983, + "loss": 2.7494, + "theoretical_loss": 3.5228656971835193, + "tokens_seen": 1467625472 + }, + { + "epoch": 4.03, + "learning_rate": 0.000280481444332999, + "loss": 2.634, + "theoretical_loss": 3.5228517693763517, + "tokens_seen": 1467691008 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002804714142427282, + "loss": 2.7738, + "theoretical_loss": 3.522837842365207, + "tokens_seen": 1467756544 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028046138415245737, + "loss": 2.8957, + "theoretical_loss": 3.5228239161500037, + "tokens_seen": 1467822080 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028045135406218655, + "loss": 2.651, + "theoretical_loss": 3.5228099907306616, + "tokens_seen": 1467887616 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2343295, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.754305601119995, + "objective/train/theoretical_loss": 3.5227960661071, + "objective/train/tokens_used": 1488413152, + "theoretical_loss": 3.5227960661071, + "tokens_seen": 1467953152 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028044132397191574, + "loss": 2.7762, + "theoretical_loss": 3.5227960661071, + "tokens_seen": 1467953152 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028043129388164497, + "loss": 2.7649, + "theoretical_loss": 3.5227821422792367, + "tokens_seen": 1468018688 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002804212637913741, + "loss": 2.8539, + "theoretical_loss": 3.5227682192469914, + "tokens_seen": 1468084224 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028041123370110333, + "loss": 2.9334, + "theoretical_loss": 3.522754297010283, + "tokens_seen": 1468149760 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002804012036108325, + "loss": 2.8146, + "theoretical_loss": 3.5227403755690307, + "tokens_seen": 1468215296 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002803911735205617, + "loss": 2.7455, + "theoretical_loss": 3.522726454923153, + "tokens_seen": 1468280832 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002803811434302909, + "loss": 2.8658, + "theoretical_loss": 3.5227125350725697, + "tokens_seen": 1468346368 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028037111334002006, + "loss": 2.8597, + "theoretical_loss": 3.5226986160172, + "tokens_seen": 1468411904 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028036108324974924, + "loss": 2.9145, + "theoretical_loss": 3.522684697756962, + "tokens_seen": 1468477440 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002803510531594785, + "loss": 2.8104, + "theoretical_loss": 3.5226707802917754, + "tokens_seen": 1468542976 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002803410230692076, + "loss": 2.9379, + "theoretical_loss": 3.5226568636215596, + "tokens_seen": 1468608512 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028033099297893684, + "loss": 2.9091, + "theoretical_loss": 3.522642947746233, + "tokens_seen": 1468674048 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028032096288866596, + "loss": 2.6289, + "theoretical_loss": 3.522629032665715, + "tokens_seen": 1468739584 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002803109327983952, + "loss": 2.8754, + "theoretical_loss": 3.5226151183799246, + "tokens_seen": 1468805120 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002803009027081244, + "loss": 2.7678, + "theoretical_loss": 3.5226012048887823, + "tokens_seen": 1468870656 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028029087261785356, + "loss": 2.7295, + "theoretical_loss": 3.522587292192205, + "tokens_seen": 1468936192 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028028084252758274, + "loss": 2.7193, + "theoretical_loss": 3.5225733802901136, + "tokens_seen": 1469001728 + }, + { + "epoch": 4.03, + "learning_rate": 0.000280270812437312, + "loss": 2.7376, + "theoretical_loss": 3.5225594691824265, + "tokens_seen": 1469067264 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002802607823470411, + "loss": 2.705, + "theoretical_loss": 3.522545558869063, + "tokens_seen": 1469132800 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028025075225677034, + "loss": 2.6457, + "theoretical_loss": 3.5225316493499426, + "tokens_seen": 1469198336 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028024072216649947, + "loss": 2.8456, + "theoretical_loss": 3.522517740624984, + "tokens_seen": 1469263872 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002802306920762287, + "loss": 2.749, + "theoretical_loss": 3.522503832694108, + "tokens_seen": 1469329408 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002802206619859579, + "loss": 2.6344, + "theoretical_loss": 3.522489925557231, + "tokens_seen": 1469394944 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028021063189568706, + "loss": 2.7232, + "theoretical_loss": 3.5224760192142752, + "tokens_seen": 1469460480 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028020060180541624, + "loss": 2.7491, + "theoretical_loss": 3.522462113665158, + "tokens_seen": 1469526016 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2346341, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8552396297454834, + "objective/train/theoretical_loss": 3.5224482089097995, + "objective/train/tokens_used": 1490051552, + "theoretical_loss": 3.5224482089097995, + "tokens_seen": 1469591552 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002801905717151454, + "loss": 2.8628, + "theoretical_loss": 3.5224482089097995, + "tokens_seen": 1469591552 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002801805416248746, + "loss": 2.6797, + "theoretical_loss": 3.522434304948119, + "tokens_seen": 1469657088 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028017051153460384, + "loss": 2.7442, + "theoretical_loss": 3.5224204017800345, + "tokens_seen": 1469722624 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028016048144433297, + "loss": 2.8965, + "theoretical_loss": 3.5224064994054674, + "tokens_seen": 1469788160 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002801504513540622, + "loss": 2.8937, + "theoretical_loss": 3.522392597824336, + "tokens_seen": 1469853696 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028014042126379133, + "loss": 2.6993, + "theoretical_loss": 3.522378697036559, + "tokens_seen": 1469919232 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028013039117352057, + "loss": 2.7676, + "theoretical_loss": 3.522364797042057, + "tokens_seen": 1469984768 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028012036108324975, + "loss": 2.7633, + "theoretical_loss": 3.5223508978407487, + "tokens_seen": 1470050304 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028011033099297893, + "loss": 2.8782, + "theoretical_loss": 3.5223369994325533, + "tokens_seen": 1470115840 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002801003009027081, + "loss": 2.7488, + "theoretical_loss": 3.52232310181739, + "tokens_seen": 1470181376 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028009027081243735, + "loss": 2.7951, + "theoretical_loss": 3.52230920499518, + "tokens_seen": 1470246912 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028008024072216647, + "loss": 2.7757, + "theoretical_loss": 3.52229530896584, + "tokens_seen": 1470312448 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002800702106318957, + "loss": 2.8047, + "theoretical_loss": 3.5222814137292917, + "tokens_seen": 1470377984 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028006018054162483, + "loss": 2.8718, + "theoretical_loss": 3.5222675192854536, + "tokens_seen": 1470443520 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028005015045135407, + "loss": 2.8069, + "theoretical_loss": 3.522253625634245, + "tokens_seen": 1470509056 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028004012036108325, + "loss": 2.8578, + "theoretical_loss": 3.522239732775585, + "tokens_seen": 1470574592 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028003009027081243, + "loss": 2.8151, + "theoretical_loss": 3.5222258407093943, + "tokens_seen": 1470640128 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028002006018054167, + "loss": 2.7486, + "theoretical_loss": 3.5222119494355915, + "tokens_seen": 1470705664 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002800100300902708, + "loss": 2.9033, + "theoretical_loss": 3.522198058954096, + "tokens_seen": 1470771200 + }, + { + "epoch": 4.03, + "learning_rate": 0.00028000000000000003, + "loss": 2.9689, + "theoretical_loss": 3.522184169264828, + "tokens_seen": 1470836736 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002799899699097292, + "loss": 2.8822, + "theoretical_loss": 3.522170280367707, + "tokens_seen": 1470902272 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002799799398194584, + "loss": 2.8103, + "theoretical_loss": 3.5221563922626515, + "tokens_seen": 1470967808 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002799699097291876, + "loss": 2.9518, + "theoretical_loss": 3.522142504949582, + "tokens_seen": 1471033344 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027995987963891675, + "loss": 2.885, + "theoretical_loss": 3.5221286184284173, + "tokens_seen": 1471098880 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027994984954864594, + "loss": 2.7673, + "theoretical_loss": 3.522114732699078, + "tokens_seen": 1471164416 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2348962, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1568944454193115, + "objective/train/theoretical_loss": 3.522100847761483, + "objective/train/tokens_used": 1491689952, + "theoretical_loss": 3.522100847761483, + "tokens_seen": 1471229952 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027993981945837517, + "loss": 2.9361, + "theoretical_loss": 3.522100847761483, + "tokens_seen": 1471229952 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002799297893681043, + "loss": 2.667, + "theoretical_loss": 3.5220869636155516, + "tokens_seen": 1471295488 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027991975927783353, + "loss": 2.8009, + "theoretical_loss": 3.522073080261204, + "tokens_seen": 1471361024 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002799097291875627, + "loss": 2.7818, + "theoretical_loss": 3.52205919769836, + "tokens_seen": 1471426560 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002798996990972919, + "loss": 2.8456, + "theoretical_loss": 3.5220453159269383, + "tokens_seen": 1471492096 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002798896690070211, + "loss": 2.6996, + "theoretical_loss": 3.5220314349468596, + "tokens_seen": 1471557632 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027987963891675026, + "loss": 2.7635, + "theoretical_loss": 3.5220175547580426, + "tokens_seen": 1471623168 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027986960882647944, + "loss": 2.8482, + "theoretical_loss": 3.5220036753604074, + "tokens_seen": 1471688704 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002798595787362087, + "loss": 2.6489, + "theoretical_loss": 3.521989796753874, + "tokens_seen": 1471754240 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002798495486459378, + "loss": 2.7179, + "theoretical_loss": 3.5219759189383613, + "tokens_seen": 1471819776 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027983951855566704, + "loss": 2.7155, + "theoretical_loss": 3.52196204191379, + "tokens_seen": 1471885312 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027982948846539616, + "loss": 2.7157, + "theoretical_loss": 3.5219481656800786, + "tokens_seen": 1471950848 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002798194583751254, + "loss": 2.707, + "theoretical_loss": 3.521934290237148, + "tokens_seen": 1472016384 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002798094282848546, + "loss": 2.7333, + "theoretical_loss": 3.5219204155849173, + "tokens_seen": 1472081920 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027979939819458376, + "loss": 2.9406, + "theoretical_loss": 3.5219065417233058, + "tokens_seen": 1472147456 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027978936810431294, + "loss": 2.84, + "theoretical_loss": 3.5218926686522343, + "tokens_seen": 1472212992 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002797793380140422, + "loss": 2.8234, + "theoretical_loss": 3.521878796371622, + "tokens_seen": 1472278528 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002797693079237713, + "loss": 2.8091, + "theoretical_loss": 3.5218649248813887, + "tokens_seen": 1472344064 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027975927783350054, + "loss": 2.8796, + "theoretical_loss": 3.5218510541814543, + "tokens_seen": 1472409600 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027974924774322967, + "loss": 2.8603, + "theoretical_loss": 3.5218371842717384, + "tokens_seen": 1472475136 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002797392176529589, + "loss": 2.6571, + "theoretical_loss": 3.5218233151521607, + "tokens_seen": 1472540672 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002797291875626881, + "loss": 2.8244, + "theoretical_loss": 3.521809446822642, + "tokens_seen": 1472606208 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027971915747241726, + "loss": 2.777, + "theoretical_loss": 3.5217955792831006, + "tokens_seen": 1472671744 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027970912738214644, + "loss": 2.765, + "theoretical_loss": 3.521781712533458, + "tokens_seen": 1472737280 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002796990972918756, + "loss": 2.8782, + "theoretical_loss": 3.5217678465736326, + "tokens_seen": 1472802816 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2350390, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.70827054977417, + "objective/train/theoretical_loss": 3.521753981403545, + "objective/train/tokens_used": 1493328352, + "theoretical_loss": 3.521753981403545, + "tokens_seen": 1472868352 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002796890672016048, + "loss": 2.7363, + "theoretical_loss": 3.521753981403545, + "tokens_seen": 1472868352 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027967903711133404, + "loss": 2.8097, + "theoretical_loss": 3.521740117023115, + "tokens_seen": 1472933888 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027966900702106317, + "loss": 2.8573, + "theoretical_loss": 3.521726253432263, + "tokens_seen": 1472999424 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002796589769307924, + "loss": 2.8872, + "theoretical_loss": 3.5217123906309076, + "tokens_seen": 1473064960 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027964894684052153, + "loss": 2.8386, + "theoretical_loss": 3.5216985286189697, + "tokens_seen": 1473130496 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027963891675025077, + "loss": 2.8281, + "theoretical_loss": 3.521684667396369, + "tokens_seen": 1473196032 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027962888665997995, + "loss": 2.697, + "theoretical_loss": 3.5216708069630256, + "tokens_seen": 1473261568 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027961885656970913, + "loss": 2.8933, + "theoretical_loss": 3.521656947318859, + "tokens_seen": 1473327104 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002796088264794383, + "loss": 2.8026, + "theoretical_loss": 3.52164308846379, + "tokens_seen": 1473392640 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027959879638916755, + "loss": 2.6514, + "theoretical_loss": 3.5216292303977377, + "tokens_seen": 1473458176 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027958876629889667, + "loss": 2.8325, + "theoretical_loss": 3.521615373120623, + "tokens_seen": 1473523712 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002795787362086259, + "loss": 2.8412, + "theoretical_loss": 3.521601516632365, + "tokens_seen": 1473589248 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027956870611835503, + "loss": 2.825, + "theoretical_loss": 3.521587660932884, + "tokens_seen": 1473654784 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027955867602808427, + "loss": 2.7597, + "theoretical_loss": 3.5215738060221, + "tokens_seen": 1473720320 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027954864593781345, + "loss": 2.7543, + "theoretical_loss": 3.521559951899933, + "tokens_seen": 1473785856 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027953861584754263, + "loss": 2.8606, + "theoretical_loss": 3.521546098566303, + "tokens_seen": 1473851392 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002795285857572718, + "loss": 2.9186, + "theoretical_loss": 3.5215322460211307, + "tokens_seen": 1473916928 + }, + { + "epoch": 4.03, + "learning_rate": 0.000279518555667001, + "loss": 2.662, + "theoretical_loss": 3.5215183942643353, + "tokens_seen": 1473982464 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002795085255767302, + "loss": 2.9405, + "theoretical_loss": 3.521504543295838, + "tokens_seen": 1474048000 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002794984954864594, + "loss": 2.7436, + "theoretical_loss": 3.521490693115558, + "tokens_seen": 1474113536 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027948846539618854, + "loss": 2.8392, + "theoretical_loss": 3.521476843723415, + "tokens_seen": 1474179072 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002794784353059178, + "loss": 2.769, + "theoretical_loss": 3.5214629951193297, + "tokens_seen": 1474244608 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002794684052156469, + "loss": 2.7754, + "theoretical_loss": 3.5214491473032226, + "tokens_seen": 1474310144 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027945837512537614, + "loss": 2.9342, + "theoretical_loss": 3.5214353002750136, + "tokens_seen": 1474375680 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002794483450351053, + "loss": 2.8556, + "theoretical_loss": 3.5214214540346225, + "tokens_seen": 1474441216 + }, + { + "debugging/Self-BLEU-5": 0.6044662790077345, + "debugging/distinct-1-grams": 0.7715330083574553, + "debugging/distinct-2-grams": 0.9568035779610206, + "debugging/entropy-1-grams": 6.302726956732748, + "debugging/entropy-2-grams": 7.377334948022522, + "debugging/length": 533.0, + "debugging/num_segments": 23, + "epoch": 4.03, + "objective/train/docs_used": 2353212, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8401081562042236, + "objective/train/theoretical_loss": 3.5214076085819697, + "objective/train/tokens_used": 1494966752, + "theoretical_loss": 3.5214076085819697, + "tokens_seen": 1474506752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002794383149448345, + "loss": 2.7978, + "theoretical_loss": 3.5214076085819697, + "tokens_seen": 1474506752 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002794282848545637, + "loss": 2.7359, + "theoretical_loss": 3.5213937639169757, + "tokens_seen": 1474572288 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002794182547642929, + "loss": 2.8265, + "theoretical_loss": 3.5213799200395597, + "tokens_seen": 1474637824 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027940822467402204, + "loss": 2.7444, + "theoretical_loss": 3.5213660769496427, + "tokens_seen": 1474703360 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002793981945837513, + "loss": 2.8828, + "theoretical_loss": 3.521352234647145, + "tokens_seen": 1474768896 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002793881644934804, + "loss": 2.9573, + "theoretical_loss": 3.5213383931319866, + "tokens_seen": 1474834432 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027937813440320964, + "loss": 2.8013, + "theoretical_loss": 3.5213245524040877, + "tokens_seen": 1474899968 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002793681043129388, + "loss": 2.8613, + "theoretical_loss": 3.521310712463368, + "tokens_seen": 1474965504 + }, + { + "epoch": 4.03, + "learning_rate": 0.000279358074222668, + "loss": 2.7953, + "theoretical_loss": 3.521296873309749, + "tokens_seen": 1475031040 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002793480441323972, + "loss": 2.6219, + "theoretical_loss": 3.52128303494315, + "tokens_seen": 1475096576 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027933801404212636, + "loss": 2.7188, + "theoretical_loss": 3.5212691973634915, + "tokens_seen": 1475162112 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027932798395185554, + "loss": 2.7173, + "theoretical_loss": 3.521255360570694, + "tokens_seen": 1475227648 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002793179538615848, + "loss": 2.6442, + "theoretical_loss": 3.5212415245646778, + "tokens_seen": 1475293184 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002793079237713139, + "loss": 2.7493, + "theoretical_loss": 3.5212276893453627, + "tokens_seen": 1475358720 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027929789368104314, + "loss": 2.8275, + "theoretical_loss": 3.5212138549126695, + "tokens_seen": 1475424256 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027928786359077227, + "loss": 2.7213, + "theoretical_loss": 3.5212000212665187, + "tokens_seen": 1475489792 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002792778335005015, + "loss": 2.7126, + "theoretical_loss": 3.52118618840683, + "tokens_seen": 1475555328 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027926780341023074, + "loss": 2.8647, + "theoretical_loss": 3.5211723563335244, + "tokens_seen": 1475620864 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027925777331995987, + "loss": 2.7984, + "theoretical_loss": 3.5211585250465216, + "tokens_seen": 1475686400 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002792477432296891, + "loss": 2.6119, + "theoretical_loss": 3.5211446945457427, + "tokens_seen": 1475751936 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002792377131394183, + "loss": 2.7815, + "theoretical_loss": 3.521130864831108, + "tokens_seen": 1475817472 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027922768304914746, + "loss": 2.7422, + "theoretical_loss": 3.521117035902537, + "tokens_seen": 1475883008 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027921765295887664, + "loss": 2.8148, + "theoretical_loss": 3.5211032077599516, + "tokens_seen": 1475948544 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002792076228686058, + "loss": 2.8235, + "theoretical_loss": 3.5210893804032715, + "tokens_seen": 1476014080 + }, + { + "epoch": 4.03, + "learning_rate": 0.000279197592778335, + "loss": 2.7891, + "theoretical_loss": 3.5210755538324165, + "tokens_seen": 1476079616 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2355969, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.682034969329834, + "objective/train/theoretical_loss": 3.521061728047308, + "objective/train/tokens_used": 1496605152, + "theoretical_loss": 3.521061728047308, + "tokens_seen": 1476145152 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027918756268806424, + "loss": 2.7761, + "theoretical_loss": 3.521061728047308, + "tokens_seen": 1476145152 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027917753259779337, + "loss": 2.7277, + "theoretical_loss": 3.521047903047866, + "tokens_seen": 1476210688 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002791675025075226, + "loss": 2.8142, + "theoretical_loss": 3.521034078834011, + "tokens_seen": 1476276224 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027915747241725173, + "loss": 2.7299, + "theoretical_loss": 3.5210202554056638, + "tokens_seen": 1476341760 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027914744232698097, + "loss": 2.8504, + "theoretical_loss": 3.5210064327627446, + "tokens_seen": 1476407296 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027913741223671015, + "loss": 2.8639, + "theoretical_loss": 3.520992610905174, + "tokens_seen": 1476472832 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027912738214643933, + "loss": 2.7533, + "theoretical_loss": 3.5209787898328724, + "tokens_seen": 1476538368 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002791173520561685, + "loss": 2.6829, + "theoretical_loss": 3.5209649695457603, + "tokens_seen": 1476603904 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027910732196589775, + "loss": 2.8413, + "theoretical_loss": 3.520951150043759, + "tokens_seen": 1476669440 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027909729187562687, + "loss": 2.9413, + "theoretical_loss": 3.520937331326788, + "tokens_seen": 1476734976 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002790872617853561, + "loss": 2.8105, + "theoretical_loss": 3.5209235133947683, + "tokens_seen": 1476800512 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027907723169508523, + "loss": 2.7579, + "theoretical_loss": 3.520909696247621, + "tokens_seen": 1476866048 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027906720160481447, + "loss": 2.7414, + "theoretical_loss": 3.5208958798852654, + "tokens_seen": 1476931584 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027905717151454365, + "loss": 2.7999, + "theoretical_loss": 3.5208820643076235, + "tokens_seen": 1476997120 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027904714142427283, + "loss": 2.8206, + "theoretical_loss": 3.5208682495146153, + "tokens_seen": 1477062656 + }, + { + "epoch": 4.03, + "learning_rate": 0.000279037111334002, + "loss": 2.7923, + "theoretical_loss": 3.520854435506161, + "tokens_seen": 1477128192 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002790270812437312, + "loss": 2.635, + "theoretical_loss": 3.520840622282182, + "tokens_seen": 1477193728 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002790170511534604, + "loss": 2.6298, + "theoretical_loss": 3.5208268098425988, + "tokens_seen": 1477259264 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002790070210631896, + "loss": 2.8434, + "theoretical_loss": 3.5208129981873313, + "tokens_seen": 1477324800 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027899699097291874, + "loss": 2.8554, + "theoretical_loss": 3.5207991873163014, + "tokens_seen": 1477390336 + }, + { + "epoch": 4.03, + "learning_rate": 0.000278986960882648, + "loss": 2.7339, + "theoretical_loss": 3.520785377229429, + "tokens_seen": 1477455872 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002789769307923771, + "loss": 2.8061, + "theoretical_loss": 3.5207715679266345, + "tokens_seen": 1477521408 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027896690070210634, + "loss": 2.8724, + "theoretical_loss": 3.5207577594078394, + "tokens_seen": 1477586944 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002789568706118355, + "loss": 2.7196, + "theoretical_loss": 3.520743951672964, + "tokens_seen": 1477652480 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002789468405215647, + "loss": 2.8111, + "theoretical_loss": 3.5207301447219295, + "tokens_seen": 1477718016 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2358390, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0252327919006348, + "objective/train/theoretical_loss": 3.520716338554656, + "objective/train/tokens_used": 1498243552, + "theoretical_loss": 3.520716338554656, + "tokens_seen": 1477783552 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002789368104312939, + "loss": 2.8829, + "theoretical_loss": 3.520716338554656, + "tokens_seen": 1477783552 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002789267803410231, + "loss": 2.8956, + "theoretical_loss": 3.5207025331710646, + "tokens_seen": 1477849088 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027891675025075224, + "loss": 2.7708, + "theoretical_loss": 3.5206887285710757, + "tokens_seen": 1477914624 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002789067201604815, + "loss": 2.9448, + "theoretical_loss": 3.5206749247546103, + "tokens_seen": 1477980160 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002788966900702106, + "loss": 2.829, + "theoretical_loss": 3.5206611217215897, + "tokens_seen": 1478045696 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027888665997993984, + "loss": 2.7811, + "theoretical_loss": 3.5206473194719337, + "tokens_seen": 1478111232 + }, + { + "epoch": 4.03, + "learning_rate": 0.000278876629889669, + "loss": 2.7782, + "theoretical_loss": 3.5206335180055643, + "tokens_seen": 1478176768 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002788665997993982, + "loss": 2.754, + "theoretical_loss": 3.5206197173224014, + "tokens_seen": 1478242304 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002788565697091274, + "loss": 2.8169, + "theoretical_loss": 3.520605917422366, + "tokens_seen": 1478307840 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027884653961885656, + "loss": 2.9033, + "theoretical_loss": 3.520592118305379, + "tokens_seen": 1478373376 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027883650952858574, + "loss": 2.6348, + "theoretical_loss": 3.5205783199713614, + "tokens_seen": 1478438912 + }, + { + "epoch": 4.03, + "learning_rate": 0.000278826479438315, + "loss": 2.9641, + "theoretical_loss": 3.520564522420234, + "tokens_seen": 1478504448 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002788164493480441, + "loss": 2.6649, + "theoretical_loss": 3.5205507256519177, + "tokens_seen": 1478569984 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027880641925777334, + "loss": 2.8509, + "theoretical_loss": 3.520536929666333, + "tokens_seen": 1478635520 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027879638916750247, + "loss": 2.746, + "theoretical_loss": 3.520523134463402, + "tokens_seen": 1478701056 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002787863590772317, + "loss": 2.8607, + "theoretical_loss": 3.520509340043044, + "tokens_seen": 1478766592 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002787763289869609, + "loss": 2.7434, + "theoretical_loss": 3.520495546405181, + "tokens_seen": 1478832128 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027876629889669007, + "loss": 2.8499, + "theoretical_loss": 3.520481753549734, + "tokens_seen": 1478897664 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027875626880641925, + "loss": 2.885, + "theoretical_loss": 3.5204679614766228, + "tokens_seen": 1478963200 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002787462387161485, + "loss": 2.64, + "theoretical_loss": 3.5204541701857694, + "tokens_seen": 1479028736 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002787362086258776, + "loss": 2.8516, + "theoretical_loss": 3.520440379677095, + "tokens_seen": 1479094272 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027872617853560685, + "loss": 2.6684, + "theoretical_loss": 3.52042658995052, + "tokens_seen": 1479159808 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027871614844533597, + "loss": 2.8902, + "theoretical_loss": 3.520412801005965, + "tokens_seen": 1479225344 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002787061183550652, + "loss": 2.7825, + "theoretical_loss": 3.5203990128433515, + "tokens_seen": 1479290880 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002786960882647944, + "loss": 2.7777, + "theoretical_loss": 3.5203852254626016, + "tokens_seen": 1479356416 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2361177, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9003305435180664, + "objective/train/theoretical_loss": 3.5203714388636342, + "objective/train/tokens_used": 1499881952, + "theoretical_loss": 3.5203714388636342, + "tokens_seen": 1479421952 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027868605817452357, + "loss": 2.6516, + "theoretical_loss": 3.5203714388636342, + "tokens_seen": 1479421952 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027867602808425275, + "loss": 2.9187, + "theoretical_loss": 3.520357653046372, + "tokens_seen": 1479487488 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027866599799398193, + "loss": 2.809, + "theoretical_loss": 3.520343868010735, + "tokens_seen": 1479553024 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002786559679037111, + "loss": 2.7919, + "theoretical_loss": 3.5203300837566447, + "tokens_seen": 1479618560 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027864593781344035, + "loss": 2.8535, + "theoretical_loss": 3.5203163002840228, + "tokens_seen": 1479684096 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002786359077231695, + "loss": 2.8477, + "theoretical_loss": 3.520302517592789, + "tokens_seen": 1479749632 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002786258776328987, + "loss": 2.9128, + "theoretical_loss": 3.5202887356828656, + "tokens_seen": 1479815168 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002786158475426279, + "loss": 2.7241, + "theoretical_loss": 3.5202749545541736, + "tokens_seen": 1479880704 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027860581745235707, + "loss": 2.8358, + "theoretical_loss": 3.5202611742066336, + "tokens_seen": 1479946240 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027859578736208625, + "loss": 2.8813, + "theoretical_loss": 3.520247394640167, + "tokens_seen": 1480011776 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027858575727181544, + "loss": 2.8951, + "theoretical_loss": 3.5202336158546945, + "tokens_seen": 1480077312 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002785757271815446, + "loss": 2.8045, + "theoretical_loss": 3.5202198378501377, + "tokens_seen": 1480142848 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027856569709127385, + "loss": 2.6576, + "theoretical_loss": 3.5202060606264185, + "tokens_seen": 1480208384 + }, + { + "epoch": 4.03, + "learning_rate": 0.000278555667001003, + "loss": 2.637, + "theoretical_loss": 3.5201922841834565, + "tokens_seen": 1480273920 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002785456369107322, + "loss": 2.6698, + "theoretical_loss": 3.520178508521174, + "tokens_seen": 1480339456 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027853560682046134, + "loss": 2.5496, + "theoretical_loss": 3.520164733639492, + "tokens_seen": 1480404992 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002785255767301906, + "loss": 2.6445, + "theoretical_loss": 3.5201509595383316, + "tokens_seen": 1480470528 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002785155466399198, + "loss": 2.8242, + "theoretical_loss": 3.520137186217614, + "tokens_seen": 1480536064 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027850551654964894, + "loss": 2.922, + "theoretical_loss": 3.5201234136772603, + "tokens_seen": 1480601600 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002784954864593782, + "loss": 2.7629, + "theoretical_loss": 3.520109641917192, + "tokens_seen": 1480667136 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002784854563691073, + "loss": 2.8517, + "theoretical_loss": 3.5200958709373302, + "tokens_seen": 1480732672 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027847542627883654, + "loss": 2.8235, + "theoretical_loss": 3.5200821007375964, + "tokens_seen": 1480798208 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002784653961885657, + "loss": 2.9115, + "theoretical_loss": 3.5200683313179115, + "tokens_seen": 1480863744 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002784553660982949, + "loss": 2.7815, + "theoretical_loss": 3.5200545626781974, + "tokens_seen": 1480929280 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002784453360080241, + "loss": 2.7341, + "theoretical_loss": 3.520040794818375, + "tokens_seen": 1480994816 + }, + { + "epoch": 4.03, + "objective/train/docs_used": 2363744, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.354550361633301, + "objective/train/theoretical_loss": 3.520027027738365, + "objective/train/tokens_used": 1501520352, + "theoretical_loss": 3.520027027738365, + "tokens_seen": 1481060352 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002784353059177533, + "loss": 2.7948, + "theoretical_loss": 3.520027027738365, + "tokens_seen": 1481060352 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027842527582748244, + "loss": 2.7524, + "theoretical_loss": 3.5200132614380903, + "tokens_seen": 1481125888 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002784152457372117, + "loss": 2.8474, + "theoretical_loss": 3.519999495917471, + "tokens_seen": 1481191424 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002784052156469408, + "loss": 2.7619, + "theoretical_loss": 3.5199857311764284, + "tokens_seen": 1481256960 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027839518555667004, + "loss": 2.6203, + "theoretical_loss": 3.519971967214884, + "tokens_seen": 1481322496 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002783851554663992, + "loss": 2.8149, + "theoretical_loss": 3.5199582040327604, + "tokens_seen": 1481388032 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002783751253761284, + "loss": 2.7614, + "theoretical_loss": 3.5199444416299777, + "tokens_seen": 1481453568 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002783650952858576, + "loss": 2.6002, + "theoretical_loss": 3.519930680006457, + "tokens_seen": 1481519104 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027835506519558676, + "loss": 2.765, + "theoretical_loss": 3.519916919162121, + "tokens_seen": 1481584640 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027834503510531594, + "loss": 2.9668, + "theoretical_loss": 3.5199031590968897, + "tokens_seen": 1481650176 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002783350050150452, + "loss": 2.6496, + "theoretical_loss": 3.5198893998106855, + "tokens_seen": 1481715712 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002783249749247743, + "loss": 2.6787, + "theoretical_loss": 3.51987564130343, + "tokens_seen": 1481781248 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027831494483450354, + "loss": 2.8707, + "theoretical_loss": 3.519861883575044, + "tokens_seen": 1481846784 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027830491474423267, + "loss": 2.8089, + "theoretical_loss": 3.519848126625449, + "tokens_seen": 1481912320 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002782948846539619, + "loss": 2.7933, + "theoretical_loss": 3.519834370454567, + "tokens_seen": 1481977856 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002782848545636911, + "loss": 2.7841, + "theoretical_loss": 3.5198206150623186, + "tokens_seen": 1482043392 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027827482447342027, + "loss": 2.7144, + "theoretical_loss": 3.519806860448626, + "tokens_seen": 1482108928 + }, + { + "epoch": 4.03, + "learning_rate": 0.00027826479438314945, + "loss": 2.7998, + "theoretical_loss": 3.5197931066134114, + "tokens_seen": 1482174464 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002782547642928787, + "loss": 2.7389, + "theoretical_loss": 3.5197793535565944, + "tokens_seen": 1482240000 + }, + { + "epoch": 4.03, + "learning_rate": 0.0002782447342026078, + "loss": 2.8094, + "theoretical_loss": 3.5197656012780985, + "tokens_seen": 1482305536 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027823470411233705, + "loss": 2.6987, + "theoretical_loss": 3.519751849777844, + "tokens_seen": 1482371072 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027822467402206617, + "loss": 2.8847, + "theoretical_loss": 3.5197380990557527, + "tokens_seen": 1482436608 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002782146439317954, + "loss": 2.778, + "theoretical_loss": 3.5197243491117467, + "tokens_seen": 1482502144 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002782046138415246, + "loss": 2.7704, + "theoretical_loss": 3.5197105999457468, + "tokens_seen": 1482567680 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027819458375125377, + "loss": 2.7205, + "theoretical_loss": 3.5196968515576748, + "tokens_seen": 1482633216 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2366625, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6894354820251465, + "objective/train/theoretical_loss": 3.519683103947453, + "objective/train/tokens_used": 1503158752, + "theoretical_loss": 3.519683103947453, + "tokens_seen": 1482698752 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027818455366098295, + "loss": 2.7507, + "theoretical_loss": 3.519683103947453, + "tokens_seen": 1482698752 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027817452357071213, + "loss": 2.9005, + "theoretical_loss": 3.5196693571150024, + "tokens_seen": 1482764288 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002781644934804413, + "loss": 2.7894, + "theoretical_loss": 3.5196556110602444, + "tokens_seen": 1482829824 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027815446339017055, + "loss": 2.9726, + "theoretical_loss": 3.5196418657831012, + "tokens_seen": 1482895360 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002781444332998997, + "loss": 2.8519, + "theoretical_loss": 3.519628121283494, + "tokens_seen": 1482960896 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002781344032096289, + "loss": 2.8613, + "theoretical_loss": 3.5196143775613447, + "tokens_seen": 1483026432 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002781243731193581, + "loss": 2.7898, + "theoretical_loss": 3.519600634616575, + "tokens_seen": 1483091968 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002781143430290873, + "loss": 2.6697, + "theoretical_loss": 3.5195868924491065, + "tokens_seen": 1483157504 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027810431293881645, + "loss": 2.7993, + "theoretical_loss": 3.519573151058861, + "tokens_seen": 1483223040 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027809428284854564, + "loss": 2.811, + "theoretical_loss": 3.5195594104457597, + "tokens_seen": 1483288576 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002780842527582748, + "loss": 2.8681, + "theoretical_loss": 3.519545670609725, + "tokens_seen": 1483354112 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027807422266800405, + "loss": 2.8212, + "theoretical_loss": 3.519531931550678, + "tokens_seen": 1483419648 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002780641925777332, + "loss": 2.8258, + "theoretical_loss": 3.5195181932685413, + "tokens_seen": 1483485184 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002780541624874624, + "loss": 2.8419, + "theoretical_loss": 3.5195044557632356, + "tokens_seen": 1483550720 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027804413239719154, + "loss": 2.8633, + "theoretical_loss": 3.5194907190346836, + "tokens_seen": 1483616256 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002780341023069208, + "loss": 2.5824, + "theoretical_loss": 3.5194769830828063, + "tokens_seen": 1483681792 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027802407221664996, + "loss": 2.9367, + "theoretical_loss": 3.5194632479075265, + "tokens_seen": 1483747328 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027801404212637914, + "loss": 2.8979, + "theoretical_loss": 3.5194495135087647, + "tokens_seen": 1483812864 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002780040120361083, + "loss": 2.6613, + "theoretical_loss": 3.5194357798864435, + "tokens_seen": 1483878400 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002779939819458375, + "loss": 2.7676, + "theoretical_loss": 3.5194220470404844, + "tokens_seen": 1483943936 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002779839518555667, + "loss": 2.727, + "theoretical_loss": 3.5194083149708097, + "tokens_seen": 1484009472 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002779739217652959, + "loss": 2.7607, + "theoretical_loss": 3.519394583677341, + "tokens_seen": 1484075008 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027796389167502504, + "loss": 2.704, + "theoretical_loss": 3.5193808531599995, + "tokens_seen": 1484140544 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002779538615847543, + "loss": 2.7646, + "theoretical_loss": 3.519367123418708, + "tokens_seen": 1484206080 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027794383149448346, + "loss": 2.9057, + "theoretical_loss": 3.519353394453388, + "tokens_seen": 1484271616 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2368081, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6427218914031982, + "objective/train/theoretical_loss": 3.5193396662639613, + "objective/train/tokens_used": 1504797152, + "theoretical_loss": 3.5193396662639613, + "tokens_seen": 1484337152 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027793380140421264, + "loss": 2.8504, + "theoretical_loss": 3.5193396662639613, + "tokens_seen": 1484337152 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002779237713139418, + "loss": 2.8032, + "theoretical_loss": 3.5193259388503497, + "tokens_seen": 1484402688 + }, + { + "epoch": 4.04, + "learning_rate": 0.000277913741223671, + "loss": 2.7264, + "theoretical_loss": 3.519312212212476, + "tokens_seen": 1484468224 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002779037111334002, + "loss": 2.9742, + "theoretical_loss": 3.5192984863502605, + "tokens_seen": 1484533760 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002778936810431294, + "loss": 2.8125, + "theoretical_loss": 3.519284761263627, + "tokens_seen": 1484599296 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027788365095285855, + "loss": 2.8463, + "theoretical_loss": 3.519271036952496, + "tokens_seen": 1484664832 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002778736208625878, + "loss": 2.6407, + "theoretical_loss": 3.51925731341679, + "tokens_seen": 1484730368 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002778635907723169, + "loss": 2.8491, + "theoretical_loss": 3.5192435906564308, + "tokens_seen": 1484795904 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027785356068204614, + "loss": 2.778, + "theoretical_loss": 3.5192298686713404, + "tokens_seen": 1484861440 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002778435305917753, + "loss": 2.8279, + "theoretical_loss": 3.519216147461441, + "tokens_seen": 1484926976 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002778335005015045, + "loss": 2.7881, + "theoretical_loss": 3.5192024270266544, + "tokens_seen": 1484992512 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002778234704112337, + "loss": 2.6614, + "theoretical_loss": 3.5191887073669026, + "tokens_seen": 1485058048 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027781344032096287, + "loss": 2.8118, + "theoretical_loss": 3.519174988482108, + "tokens_seen": 1485123584 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027780341023069205, + "loss": 2.8373, + "theoretical_loss": 3.519161270372192, + "tokens_seen": 1485189120 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002777933801404213, + "loss": 2.8495, + "theoretical_loss": 3.519147553037077, + "tokens_seen": 1485254656 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002777833500501504, + "loss": 2.7823, + "theoretical_loss": 3.5191338364766853, + "tokens_seen": 1485320192 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027777331995987965, + "loss": 2.8346, + "theoretical_loss": 3.5191201206909386, + "tokens_seen": 1485385728 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002777632898696089, + "loss": 2.6173, + "theoretical_loss": 3.519106405679759, + "tokens_seen": 1485451264 + }, + { + "epoch": 4.04, + "learning_rate": 0.000277753259779338, + "loss": 2.8413, + "theoretical_loss": 3.5190926914430687, + "tokens_seen": 1485516800 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027774322968906725, + "loss": 2.7795, + "theoretical_loss": 3.5190789779807896, + "tokens_seen": 1485582336 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027773319959879637, + "loss": 2.7657, + "theoretical_loss": 3.519065265292844, + "tokens_seen": 1485647872 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002777231695085256, + "loss": 2.7846, + "theoretical_loss": 3.519051553379154, + "tokens_seen": 1485713408 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002777131394182548, + "loss": 2.7505, + "theoretical_loss": 3.5190378422396416, + "tokens_seen": 1485778944 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027770310932798397, + "loss": 2.6807, + "theoretical_loss": 3.5190241318742297, + "tokens_seen": 1485844480 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027769307923771315, + "loss": 2.8083, + "theoretical_loss": 3.519010422282839, + "tokens_seen": 1485910016 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2371006, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.675480365753174, + "objective/train/theoretical_loss": 3.518996713465393, + "objective/train/tokens_used": 1506435552, + "theoretical_loss": 3.518996713465393, + "tokens_seen": 1485975552 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027768304914744233, + "loss": 2.697, + "theoretical_loss": 3.518996713465393, + "tokens_seen": 1485975552 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002776730190571715, + "loss": 2.7692, + "theoretical_loss": 3.518983005421813, + "tokens_seen": 1486041088 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027766298896690075, + "loss": 2.7966, + "theoretical_loss": 3.518969298152022, + "tokens_seen": 1486106624 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002776529588766299, + "loss": 2.7963, + "theoretical_loss": 3.518955591655941, + "tokens_seen": 1486172160 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002776429287863591, + "loss": 2.7115, + "theoretical_loss": 3.518941885933494, + "tokens_seen": 1486237696 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002776328986960883, + "loss": 2.7703, + "theoretical_loss": 3.5189281809846014, + "tokens_seen": 1486303232 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002776228686058175, + "loss": 2.8583, + "theoretical_loss": 3.5189144768091865, + "tokens_seen": 1486368768 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027761283851554665, + "loss": 2.9656, + "theoretical_loss": 3.5189007734071716, + "tokens_seen": 1486434304 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027760280842527584, + "loss": 2.7896, + "theoretical_loss": 3.518887070778478, + "tokens_seen": 1486499840 + }, + { + "epoch": 4.04, + "learning_rate": 0.000277592778335005, + "loss": 2.6554, + "theoretical_loss": 3.518873368923029, + "tokens_seen": 1486565376 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027758274824473425, + "loss": 2.9215, + "theoretical_loss": 3.518859667840746, + "tokens_seen": 1486630912 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002775727181544634, + "loss": 2.8353, + "theoretical_loss": 3.5188459675315524, + "tokens_seen": 1486696448 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002775626880641926, + "loss": 2.7863, + "theoretical_loss": 3.5188322679953696, + "tokens_seen": 1486761984 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027755265797392174, + "loss": 2.7213, + "theoretical_loss": 3.5188185692321206, + "tokens_seen": 1486827520 + }, + { + "epoch": 4.04, + "learning_rate": 0.000277542627883651, + "loss": 2.8045, + "theoretical_loss": 3.518804871241727, + "tokens_seen": 1486893056 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027753259779338016, + "loss": 2.9394, + "theoretical_loss": 3.518791174024111, + "tokens_seen": 1486958592 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027752256770310934, + "loss": 2.8797, + "theoretical_loss": 3.518777477579196, + "tokens_seen": 1487024128 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002775125376128385, + "loss": 2.833, + "theoretical_loss": 3.518763781906903, + "tokens_seen": 1487089664 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002775025075225677, + "loss": 2.7541, + "theoretical_loss": 3.518750087007156, + "tokens_seen": 1487155200 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002774924774322969, + "loss": 2.9504, + "theoretical_loss": 3.518736392879876, + "tokens_seen": 1487220736 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002774824473420261, + "loss": 2.8584, + "theoretical_loss": 3.518722699524986, + "tokens_seen": 1487286272 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027747241725175524, + "loss": 2.7985, + "theoretical_loss": 3.518709006942408, + "tokens_seen": 1487351808 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002774623871614845, + "loss": 2.7501, + "theoretical_loss": 3.5186953151320655, + "tokens_seen": 1487417344 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027745235707121366, + "loss": 2.8078, + "theoretical_loss": 3.518681624093879, + "tokens_seen": 1487482880 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027744232698094284, + "loss": 2.6989, + "theoretical_loss": 3.518667933827773, + "tokens_seen": 1487548416 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2373648, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.910698890686035, + "objective/train/theoretical_loss": 3.5186542443336686, + "objective/train/tokens_used": 1508073952, + "theoretical_loss": 3.5186542443336686, + "tokens_seen": 1487613952 + }, + { + "epoch": 4.04, + "learning_rate": 0.000277432296890672, + "loss": 2.8907, + "theoretical_loss": 3.5186542443336686, + "tokens_seen": 1487613952 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002774222668004012, + "loss": 2.8532, + "theoretical_loss": 3.518640555611489, + "tokens_seen": 1487679488 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002774122367101304, + "loss": 2.7111, + "theoretical_loss": 3.518626867661156, + "tokens_seen": 1487745024 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002774022066198596, + "loss": 2.834, + "theoretical_loss": 3.5186131804825926, + "tokens_seen": 1487810560 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027739217652958875, + "loss": 2.69, + "theoretical_loss": 3.518599494075721, + "tokens_seen": 1487876096 + }, + { + "epoch": 4.04, + "learning_rate": 0.000277382146439318, + "loss": 2.8219, + "theoretical_loss": 3.5185858084404638, + "tokens_seen": 1487941632 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002773721163490471, + "loss": 2.8215, + "theoretical_loss": 3.5185721235767433, + "tokens_seen": 1488007168 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027736208625877634, + "loss": 2.8751, + "theoretical_loss": 3.518558439484483, + "tokens_seen": 1488072704 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002773520561685055, + "loss": 2.8029, + "theoretical_loss": 3.518544756163604, + "tokens_seen": 1488138240 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002773420260782347, + "loss": 2.9375, + "theoretical_loss": 3.51853107361403, + "tokens_seen": 1488203776 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002773319959879639, + "loss": 2.9096, + "theoretical_loss": 3.518517391835683, + "tokens_seen": 1488269312 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027732196589769307, + "loss": 2.7763, + "theoretical_loss": 3.518503710828486, + "tokens_seen": 1488334848 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027731193580742225, + "loss": 2.8702, + "theoretical_loss": 3.5184900305923605, + "tokens_seen": 1488400384 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002773019057171515, + "loss": 2.8045, + "theoretical_loss": 3.5184763511272306, + "tokens_seen": 1488465920 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002772918756268806, + "loss": 2.7983, + "theoretical_loss": 3.5184626724330177, + "tokens_seen": 1488531456 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027728184553660985, + "loss": 2.792, + "theoretical_loss": 3.518448994509645, + "tokens_seen": 1488596992 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027727181544633903, + "loss": 2.6726, + "theoretical_loss": 3.518435317357035, + "tokens_seen": 1488662528 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002772617853560682, + "loss": 2.7989, + "theoretical_loss": 3.5184216409751103, + "tokens_seen": 1488728064 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002772517552657974, + "loss": 2.8158, + "theoretical_loss": 3.518407965363794, + "tokens_seen": 1488793600 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027724172517552657, + "loss": 2.9921, + "theoretical_loss": 3.5183942905230077, + "tokens_seen": 1488859136 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027723169508525575, + "loss": 2.7238, + "theoretical_loss": 3.518380616452675, + "tokens_seen": 1488924672 + }, + { + "epoch": 4.04, + "learning_rate": 0.000277221664994985, + "loss": 2.6784, + "theoretical_loss": 3.518366943152719, + "tokens_seen": 1488990208 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002772116349047141, + "loss": 2.8486, + "theoretical_loss": 3.5183532706230602, + "tokens_seen": 1489055744 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027720160481444335, + "loss": 2.7886, + "theoretical_loss": 3.518339598863624, + "tokens_seen": 1489121280 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002771915747241725, + "loss": 2.7608, + "theoretical_loss": 3.518325927874332, + "tokens_seen": 1489186816 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2376435, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.709365129470825, + "objective/train/theoretical_loss": 3.518312257655106, + "objective/train/tokens_used": 1509712352, + "theoretical_loss": 3.518312257655106, + "tokens_seen": 1489252352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002771815446339017, + "loss": 2.725, + "theoretical_loss": 3.518312257655106, + "tokens_seen": 1489252352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002771715145436309, + "loss": 2.885, + "theoretical_loss": 3.5182985882058704, + "tokens_seen": 1489317888 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002771614844533601, + "loss": 2.8115, + "theoretical_loss": 3.518284919526547, + "tokens_seen": 1489383424 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027715145436308926, + "loss": 2.6317, + "theoretical_loss": 3.5182712516170582, + "tokens_seen": 1489448960 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002771414242728185, + "loss": 2.7665, + "theoretical_loss": 3.518257584477327, + "tokens_seen": 1489514496 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002771313941825476, + "loss": 2.7597, + "theoretical_loss": 3.5182439181072773, + "tokens_seen": 1489580032 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027712136409227685, + "loss": 2.7397, + "theoretical_loss": 3.518230252506831, + "tokens_seen": 1489645568 + }, + { + "epoch": 4.04, + "learning_rate": 0.000277111334002006, + "loss": 2.8003, + "theoretical_loss": 3.5182165876759104, + "tokens_seen": 1489711104 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002771013039117352, + "loss": 2.7256, + "theoretical_loss": 3.5182029236144396, + "tokens_seen": 1489776640 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002770912738214644, + "loss": 2.9139, + "theoretical_loss": 3.51818926032234, + "tokens_seen": 1489842176 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002770812437311936, + "loss": 2.7615, + "theoretical_loss": 3.5181755977995355, + "tokens_seen": 1489907712 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027707121364092276, + "loss": 2.9001, + "theoretical_loss": 3.5181619360459484, + "tokens_seen": 1489973248 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027706118355065194, + "loss": 2.7179, + "theoretical_loss": 3.5181482750615016, + "tokens_seen": 1490038784 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002770511534603811, + "loss": 2.7856, + "theoretical_loss": 3.518134614846119, + "tokens_seen": 1490104320 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027704112337011036, + "loss": 2.7952, + "theoretical_loss": 3.5181209553997217, + "tokens_seen": 1490169856 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002770310932798395, + "loss": 3.0025, + "theoretical_loss": 3.518107296722234, + "tokens_seen": 1490235392 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002770210631895687, + "loss": 2.8967, + "theoretical_loss": 3.518093638813578, + "tokens_seen": 1490300928 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002770110330992979, + "loss": 2.9293, + "theoretical_loss": 3.5180799816736767, + "tokens_seen": 1490366464 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002770010030090271, + "loss": 2.8376, + "theoretical_loss": 3.5180663253024536, + "tokens_seen": 1490432000 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002769909729187563, + "loss": 2.9232, + "theoretical_loss": 3.5180526696998315, + "tokens_seen": 1490497536 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027698094282848544, + "loss": 2.7323, + "theoretical_loss": 3.5180390148657326, + "tokens_seen": 1490563072 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002769709127382147, + "loss": 2.8289, + "theoretical_loss": 3.518025360800081, + "tokens_seen": 1490628608 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027696088264794386, + "loss": 2.7967, + "theoretical_loss": 3.518011707502799, + "tokens_seen": 1490694144 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027695085255767304, + "loss": 2.6674, + "theoretical_loss": 3.517998054973809, + "tokens_seen": 1490759680 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002769408224674022, + "loss": 2.7284, + "theoretical_loss": 3.517984403213035, + "tokens_seen": 1490825216 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2379105, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.967362880706787, + "objective/train/theoretical_loss": 3.5179707522204, + "objective/train/tokens_used": 1511350752, + "theoretical_loss": 3.5179707522204, + "tokens_seen": 1490890752 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002769307923771314, + "loss": 2.7098, + "theoretical_loss": 3.5179707522204, + "tokens_seen": 1490890752 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002769207622868606, + "loss": 2.8612, + "theoretical_loss": 3.517957101995826, + "tokens_seen": 1490956288 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002769107321965898, + "loss": 2.9264, + "theoretical_loss": 3.5179434525392375, + "tokens_seen": 1491021824 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027690070210631895, + "loss": 2.9519, + "theoretical_loss": 3.5179298038505564, + "tokens_seen": 1491087360 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002768906720160482, + "loss": 2.8284, + "theoretical_loss": 3.517916155929706, + "tokens_seen": 1491152896 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002768806419257773, + "loss": 2.8644, + "theoretical_loss": 3.5179025087766096, + "tokens_seen": 1491218432 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027687061183550655, + "loss": 2.9394, + "theoretical_loss": 3.5178888623911897, + "tokens_seen": 1491283968 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002768605817452357, + "loss": 2.695, + "theoretical_loss": 3.51787521677337, + "tokens_seen": 1491349504 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002768505516549649, + "loss": 2.781, + "theoretical_loss": 3.5178615719230737, + "tokens_seen": 1491415040 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002768405215646941, + "loss": 2.7823, + "theoretical_loss": 3.5178479278402235, + "tokens_seen": 1491480576 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027683049147442327, + "loss": 2.7656, + "theoretical_loss": 3.5178342845247426, + "tokens_seen": 1491546112 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027682046138415245, + "loss": 2.9304, + "theoretical_loss": 3.517820641976554, + "tokens_seen": 1491611648 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002768104312938817, + "loss": 2.7628, + "theoretical_loss": 3.517807000195581, + "tokens_seen": 1491677184 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002768004012036108, + "loss": 2.7854, + "theoretical_loss": 3.5177933591817467, + "tokens_seen": 1491742720 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027679037111334005, + "loss": 2.7907, + "theoretical_loss": 3.5177797189349747, + "tokens_seen": 1491808256 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027678034102306923, + "loss": 2.8948, + "theoretical_loss": 3.5177660794551873, + "tokens_seen": 1491873792 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002767703109327984, + "loss": 2.7267, + "theoretical_loss": 3.5177524407423086, + "tokens_seen": 1491939328 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002767602808425276, + "loss": 2.7925, + "theoretical_loss": 3.517738802796261, + "tokens_seen": 1492004864 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027675025075225677, + "loss": 2.8089, + "theoretical_loss": 3.517725165616968, + "tokens_seen": 1492070400 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027674022066198595, + "loss": 2.8926, + "theoretical_loss": 3.5177115292043535, + "tokens_seen": 1492135936 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002767301905717152, + "loss": 2.7911, + "theoretical_loss": 3.5176978935583394, + "tokens_seen": 1492201472 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002767201604814443, + "loss": 2.6493, + "theoretical_loss": 3.51768425867885, + "tokens_seen": 1492267008 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027671013039117355, + "loss": 2.6302, + "theoretical_loss": 3.517670624565808, + "tokens_seen": 1492332544 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002767001003009027, + "loss": 2.8003, + "theoretical_loss": 3.5176569912191367, + "tokens_seen": 1492398080 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002766900702106319, + "loss": 2.8425, + "theoretical_loss": 3.51764335863876, + "tokens_seen": 1492463616 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2381939, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7817158699035645, + "objective/train/theoretical_loss": 3.5176297268246, + "objective/train/tokens_used": 1512989152, + "theoretical_loss": 3.5176297268246, + "tokens_seen": 1492529152 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002766800401203611, + "loss": 2.837, + "theoretical_loss": 3.5176297268246, + "tokens_seen": 1492529152 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002766700100300903, + "loss": 2.7152, + "theoretical_loss": 3.5176160957765816, + "tokens_seen": 1492594688 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027665997993981946, + "loss": 2.952, + "theoretical_loss": 3.5176024654946265, + "tokens_seen": 1492660224 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002766499498495487, + "loss": 2.825, + "theoretical_loss": 3.5175888359786587, + "tokens_seen": 1492725760 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002766399197592778, + "loss": 2.8621, + "theoretical_loss": 3.517575207228602, + "tokens_seen": 1492791296 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027662988966900705, + "loss": 2.8823, + "theoretical_loss": 3.5175615792443793, + "tokens_seen": 1492856832 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002766198595787362, + "loss": 2.7898, + "theoretical_loss": 3.517547952025913, + "tokens_seen": 1492922368 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002766098294884654, + "loss": 2.5678, + "theoretical_loss": 3.5175343255731284, + "tokens_seen": 1492987904 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002765997993981946, + "loss": 2.7555, + "theoretical_loss": 3.5175206998859476, + "tokens_seen": 1493053440 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002765897693079238, + "loss": 2.6258, + "theoretical_loss": 3.517507074964294, + "tokens_seen": 1493118976 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027657973921765296, + "loss": 2.891, + "theoretical_loss": 3.5174934508080913, + "tokens_seen": 1493184512 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027656970912738214, + "loss": 2.8854, + "theoretical_loss": 3.517479827417263, + "tokens_seen": 1493250048 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002765596790371113, + "loss": 2.8298, + "theoretical_loss": 3.517466204791732, + "tokens_seen": 1493315584 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027654964894684056, + "loss": 2.8243, + "theoretical_loss": 3.5174525829314227, + "tokens_seen": 1493381120 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002765396188565697, + "loss": 2.7412, + "theoretical_loss": 3.517438961836257, + "tokens_seen": 1493446656 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002765295887662989, + "loss": 2.768, + "theoretical_loss": 3.5174253415061605, + "tokens_seen": 1493512192 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027651955867602805, + "loss": 2.8708, + "theoretical_loss": 3.5174117219410546, + "tokens_seen": 1493577728 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002765095285857573, + "loss": 2.6959, + "theoretical_loss": 3.517398103140864, + "tokens_seen": 1493643264 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027649949849548646, + "loss": 2.6768, + "theoretical_loss": 3.517384485105511, + "tokens_seen": 1493708800 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027648946840521564, + "loss": 2.8081, + "theoretical_loss": 3.5173708678349205, + "tokens_seen": 1493774336 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002764794383149448, + "loss": 2.8854, + "theoretical_loss": 3.517357251329015, + "tokens_seen": 1493839872 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027646940822467406, + "loss": 2.8304, + "theoretical_loss": 3.517343635587719, + "tokens_seen": 1493905408 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002764593781344032, + "loss": 2.6288, + "theoretical_loss": 3.517330020610955, + "tokens_seen": 1493970944 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002764493480441324, + "loss": 2.7445, + "theoretical_loss": 3.517316406398647, + "tokens_seen": 1494036480 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027643931795386155, + "loss": 2.8036, + "theoretical_loss": 3.5173027929507183, + "tokens_seen": 1494102016 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2384882, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.955392837524414, + "objective/train/theoretical_loss": 3.5172891802670927, + "objective/train/tokens_used": 1514627552, + "theoretical_loss": 3.5172891802670927, + "tokens_seen": 1494167552 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002764292878635908, + "loss": 2.8283, + "theoretical_loss": 3.5172891802670927, + "tokens_seen": 1494167552 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027641925777331997, + "loss": 2.9398, + "theoretical_loss": 3.517275568347694, + "tokens_seen": 1494233088 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027640922768304915, + "loss": 2.6832, + "theoretical_loss": 3.5172619571924453, + "tokens_seen": 1494298624 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027639919759277833, + "loss": 2.8957, + "theoretical_loss": 3.5172483468012703, + "tokens_seen": 1494364160 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002763891675025075, + "loss": 2.8244, + "theoretical_loss": 3.5172347371740926, + "tokens_seen": 1494429696 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002763791374122367, + "loss": 2.8103, + "theoretical_loss": 3.5172211283108363, + "tokens_seen": 1494495232 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002763691073219659, + "loss": 2.8254, + "theoretical_loss": 3.5172075202114246, + "tokens_seen": 1494560768 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027635907723169505, + "loss": 2.8713, + "theoretical_loss": 3.517193912875781, + "tokens_seen": 1494626304 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002763490471414243, + "loss": 2.9366, + "theoretical_loss": 3.517180306303829, + "tokens_seen": 1494691840 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002763390170511534, + "loss": 2.8366, + "theoretical_loss": 3.517166700495493, + "tokens_seen": 1494757376 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027632898696088265, + "loss": 2.8216, + "theoretical_loss": 3.517153095450696, + "tokens_seen": 1494822912 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027631895687061183, + "loss": 2.7673, + "theoretical_loss": 3.5171394911693623, + "tokens_seen": 1494888448 + }, + { + "epoch": 4.04, + "learning_rate": 0.000276308926780341, + "loss": 3.0934, + "theoretical_loss": 3.5171258876514147, + "tokens_seen": 1494953984 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002762988966900702, + "loss": 2.9807, + "theoretical_loss": 3.5171122848967777, + "tokens_seen": 1495019520 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027628886659979943, + "loss": 2.7752, + "theoretical_loss": 3.5170986829053748, + "tokens_seen": 1495085056 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027627883650952856, + "loss": 2.7733, + "theoretical_loss": 3.5170850816771293, + "tokens_seen": 1495150592 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002762688064192578, + "loss": 2.7876, + "theoretical_loss": 3.5170714812119654, + "tokens_seen": 1495216128 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027625877632898697, + "loss": 2.684, + "theoretical_loss": 3.517057881509807, + "tokens_seen": 1495281664 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027624874623871615, + "loss": 2.7996, + "theoretical_loss": 3.517044282570577, + "tokens_seen": 1495347200 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002762387161484454, + "loss": 2.6748, + "theoretical_loss": 3.5170306843942, + "tokens_seen": 1495412736 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002762286860581745, + "loss": 2.6214, + "theoretical_loss": 3.5170170869805997, + "tokens_seen": 1495478272 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027621865596790375, + "loss": 2.7577, + "theoretical_loss": 3.5170034903297, + "tokens_seen": 1495543808 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002762086258776329, + "loss": 2.7838, + "theoretical_loss": 3.5169898944414237, + "tokens_seen": 1495609344 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002761985957873621, + "loss": 2.7372, + "theoretical_loss": 3.5169762993156954, + "tokens_seen": 1495674880 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002761885656970913, + "loss": 2.7787, + "theoretical_loss": 3.5169627049524395, + "tokens_seen": 1495740416 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2387207, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9086039066314697, + "objective/train/theoretical_loss": 3.516949111351579, + "objective/train/tokens_used": 1516265952, + "theoretical_loss": 3.516949111351579, + "tokens_seen": 1495805952 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002761785356068205, + "loss": 2.8808, + "theoretical_loss": 3.516949111351579, + "tokens_seen": 1495805952 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027616850551654966, + "loss": 2.8882, + "theoretical_loss": 3.516935518513037, + "tokens_seen": 1495871488 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002761584754262789, + "loss": 2.7332, + "theoretical_loss": 3.516921926436739, + "tokens_seen": 1495937024 + }, + { + "epoch": 4.04, + "learning_rate": 0.000276148445336008, + "loss": 2.6955, + "theoretical_loss": 3.516908335122608, + "tokens_seen": 1496002560 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027613841524573725, + "loss": 2.8234, + "theoretical_loss": 3.5168947445705685, + "tokens_seen": 1496068096 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002761283851554664, + "loss": 2.8588, + "theoretical_loss": 3.516881154780543, + "tokens_seen": 1496133632 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002761183550651956, + "loss": 2.616, + "theoretical_loss": 3.516867565752457, + "tokens_seen": 1496199168 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002761083249749248, + "loss": 2.7341, + "theoretical_loss": 3.5168539774862335, + "tokens_seen": 1496264704 + }, + { + "epoch": 4.04, + "learning_rate": 0.000276098294884654, + "loss": 2.8449, + "theoretical_loss": 3.5168403899817964, + "tokens_seen": 1496330240 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027608826479438316, + "loss": 2.8083, + "theoretical_loss": 3.51682680323907, + "tokens_seen": 1496395776 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027607823470411234, + "loss": 2.7892, + "theoretical_loss": 3.5168132172579782, + "tokens_seen": 1496461312 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002760682046138415, + "loss": 2.8082, + "theoretical_loss": 3.516799632038445, + "tokens_seen": 1496526848 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027605817452357076, + "loss": 2.8837, + "theoretical_loss": 3.5167860475803936, + "tokens_seen": 1496592384 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002760481444332999, + "loss": 2.7313, + "theoretical_loss": 3.5167724638837488, + "tokens_seen": 1496657920 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002760381143430291, + "loss": 2.6105, + "theoretical_loss": 3.516758880948435, + "tokens_seen": 1496723456 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027602808425275825, + "loss": 2.8334, + "theoretical_loss": 3.516745298774375, + "tokens_seen": 1496788992 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002760180541624875, + "loss": 2.9491, + "theoretical_loss": 3.5167317173614934, + "tokens_seen": 1496854528 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027600802407221666, + "loss": 2.8122, + "theoretical_loss": 3.5167181367097142, + "tokens_seen": 1496920064 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027599799398194584, + "loss": 2.85, + "theoretical_loss": 3.5167045568189614, + "tokens_seen": 1496985600 + }, + { + "epoch": 4.04, + "learning_rate": 0.000275987963891675, + "loss": 2.8866, + "theoretical_loss": 3.5166909776891595, + "tokens_seen": 1497051136 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027597793380140426, + "loss": 2.8, + "theoretical_loss": 3.5166773993202316, + "tokens_seen": 1497116672 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002759679037111334, + "loss": 2.8464, + "theoretical_loss": 3.5166638217121022, + "tokens_seen": 1497182208 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002759578736208626, + "loss": 2.8598, + "theoretical_loss": 3.516650244864696, + "tokens_seen": 1497247744 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027594784353059175, + "loss": 2.8603, + "theoretical_loss": 3.5166366687779362, + "tokens_seen": 1497313280 + }, + { + "epoch": 4.04, + "learning_rate": 0.000275937813440321, + "loss": 2.8793, + "theoretical_loss": 3.5166230934517473, + "tokens_seen": 1497378816 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2390068, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.78629732131958, + "objective/train/theoretical_loss": 3.516609518886053, + "objective/train/tokens_used": 1517904352, + "theoretical_loss": 3.516609518886053, + "tokens_seen": 1497444352 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027592778335005017, + "loss": 2.825, + "theoretical_loss": 3.516609518886053, + "tokens_seen": 1497444352 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027591775325977935, + "loss": 2.7994, + "theoretical_loss": 3.516595945080778, + "tokens_seen": 1497509888 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027590772316950853, + "loss": 2.7642, + "theoretical_loss": 3.5165823720358462, + "tokens_seen": 1497575424 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002758976930792377, + "loss": 2.7386, + "theoretical_loss": 3.516568799751182, + "tokens_seen": 1497640960 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002758876629889669, + "loss": 2.7879, + "theoretical_loss": 3.516555228226709, + "tokens_seen": 1497706496 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002758776328986961, + "loss": 2.7711, + "theoretical_loss": 3.5165416574623514, + "tokens_seen": 1497772032 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027586760280842525, + "loss": 2.7945, + "theoretical_loss": 3.516528087458034, + "tokens_seen": 1497837568 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002758575727181545, + "loss": 2.8413, + "theoretical_loss": 3.5165145182136808, + "tokens_seen": 1497903104 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002758475426278836, + "loss": 2.8966, + "theoretical_loss": 3.516500949729215, + "tokens_seen": 1497968640 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027583751253761285, + "loss": 2.8496, + "theoretical_loss": 3.516487382004562, + "tokens_seen": 1498034176 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027582748244734203, + "loss": 2.8436, + "theoretical_loss": 3.5164738150396455, + "tokens_seen": 1498099712 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002758174523570712, + "loss": 2.8963, + "theoretical_loss": 3.51646024883439, + "tokens_seen": 1498165248 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002758074222668004, + "loss": 2.805, + "theoretical_loss": 3.516446683388719, + "tokens_seen": 1498230784 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027579739217652963, + "loss": 2.8117, + "theoretical_loss": 3.516433118702558, + "tokens_seen": 1498296320 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027578736208625876, + "loss": 2.7844, + "theoretical_loss": 3.5164195547758297, + "tokens_seen": 1498361856 + }, + { + "epoch": 4.04, + "learning_rate": 0.000275777331995988, + "loss": 2.8703, + "theoretical_loss": 3.51640599160846, + "tokens_seen": 1498427392 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002757673019057171, + "loss": 2.8645, + "theoretical_loss": 3.5163924292003723, + "tokens_seen": 1498492928 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027575727181544635, + "loss": 2.8515, + "theoretical_loss": 3.5163788675514907, + "tokens_seen": 1498558464 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027574724172517554, + "loss": 2.8265, + "theoretical_loss": 3.51636530666174, + "tokens_seen": 1498624000 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002757372116349047, + "loss": 2.8965, + "theoretical_loss": 3.5163517465310443, + "tokens_seen": 1498689536 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002757271815446339, + "loss": 2.8802, + "theoretical_loss": 3.5163381871593278, + "tokens_seen": 1498755072 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002757171514543631, + "loss": 2.6517, + "theoretical_loss": 3.516324628546515, + "tokens_seen": 1498820608 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027570712136409226, + "loss": 2.7296, + "theoretical_loss": 3.51631107069253, + "tokens_seen": 1498886144 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002756970912738215, + "loss": 2.7701, + "theoretical_loss": 3.516297513597298, + "tokens_seen": 1498951680 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002756870611835506, + "loss": 2.9065, + "theoretical_loss": 3.5162839572607423, + "tokens_seen": 1499017216 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2392821, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.784412384033203, + "objective/train/theoretical_loss": 3.516270401682787, + "objective/train/tokens_used": 1519542752, + "theoretical_loss": 3.516270401682787, + "tokens_seen": 1499082752 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027567703109327986, + "loss": 2.8582, + "theoretical_loss": 3.516270401682787, + "tokens_seen": 1499082752 + }, + { + "epoch": 4.04, + "learning_rate": 0.000275667001003009, + "loss": 2.7454, + "theoretical_loss": 3.5162568468633584, + "tokens_seen": 1499148288 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002756569709127382, + "loss": 2.8656, + "theoretical_loss": 3.516243292802379, + "tokens_seen": 1499213824 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002756469408224674, + "loss": 2.8188, + "theoretical_loss": 3.5162297394997744, + "tokens_seen": 1499279360 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002756369107321966, + "loss": 2.861, + "theoretical_loss": 3.516216186955468, + "tokens_seen": 1499344896 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027562688064192576, + "loss": 2.7807, + "theoretical_loss": 3.5162026351693854, + "tokens_seen": 1499410432 + }, + { + "epoch": 4.04, + "learning_rate": 0.000275616850551655, + "loss": 2.8452, + "theoretical_loss": 3.5161890841414496, + "tokens_seen": 1499475968 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002756068204613841, + "loss": 2.8758, + "theoretical_loss": 3.516175533871586, + "tokens_seen": 1499541504 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027559679037111336, + "loss": 2.7622, + "theoretical_loss": 3.5161619843597194, + "tokens_seen": 1499607040 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002755867602808425, + "loss": 2.8336, + "theoretical_loss": 3.516148435605773, + "tokens_seen": 1499672576 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002755767301905717, + "loss": 2.8758, + "theoretical_loss": 3.516134887609673, + "tokens_seen": 1499738112 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002755667001003009, + "loss": 2.7585, + "theoretical_loss": 3.5161213403713427, + "tokens_seen": 1499803648 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002755566700100301, + "loss": 2.8567, + "theoretical_loss": 3.516107793890707, + "tokens_seen": 1499869184 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027554663991975927, + "loss": 2.806, + "theoretical_loss": 3.5160942481676893, + "tokens_seen": 1499934720 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027553660982948845, + "loss": 2.9481, + "theoretical_loss": 3.516080703202216, + "tokens_seen": 1500000256 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027552657973921763, + "loss": 2.8521, + "theoretical_loss": 3.5160671589942107, + "tokens_seen": 1500065792 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027551654964894686, + "loss": 2.762, + "theoretical_loss": 3.5160536155435977, + "tokens_seen": 1500131328 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027550651955867604, + "loss": 2.87, + "theoretical_loss": 3.516040072850302, + "tokens_seen": 1500196864 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002754964894684052, + "loss": 2.8778, + "theoretical_loss": 3.5160265309142487, + "tokens_seen": 1500262400 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027548645937813446, + "loss": 2.8656, + "theoretical_loss": 3.516012989735361, + "tokens_seen": 1500327936 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002754764292878636, + "loss": 2.7457, + "theoretical_loss": 3.515999449313564, + "tokens_seen": 1500393472 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002754663991975928, + "loss": 2.7479, + "theoretical_loss": 3.515985909648783, + "tokens_seen": 1500459008 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027545636910732195, + "loss": 2.8118, + "theoretical_loss": 3.515972370740942, + "tokens_seen": 1500524544 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002754463390170512, + "loss": 2.7031, + "theoretical_loss": 3.515958832589966, + "tokens_seen": 1500590080 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027543630892678037, + "loss": 3.0227, + "theoretical_loss": 3.5159452951957793, + "tokens_seen": 1500655616 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2394261, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.177739143371582, + "objective/train/theoretical_loss": 3.5159317585583065, + "objective/train/tokens_used": 1521181152, + "theoretical_loss": 3.5159317585583065, + "tokens_seen": 1500721152 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027542627883650955, + "loss": 2.8222, + "theoretical_loss": 3.5159317585583065, + "tokens_seen": 1500721152 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027541624874623873, + "loss": 2.7101, + "theoretical_loss": 3.515918222677472, + "tokens_seen": 1500786688 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002754062186559679, + "loss": 2.7329, + "theoretical_loss": 3.515904687553202, + "tokens_seen": 1500852224 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002753961885656971, + "loss": 2.8429, + "theoretical_loss": 3.5158911531854193, + "tokens_seen": 1500917760 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002753861584754263, + "loss": 2.8853, + "theoretical_loss": 3.5158776195740495, + "tokens_seen": 1500983296 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027537612838515545, + "loss": 2.8067, + "theoretical_loss": 3.5158640867190174, + "tokens_seen": 1501048832 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002753660982948847, + "loss": 2.9789, + "theoretical_loss": 3.515850554620247, + "tokens_seen": 1501114368 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002753560682046138, + "loss": 2.6286, + "theoretical_loss": 3.5158370232776637, + "tokens_seen": 1501179904 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027534603811434305, + "loss": 2.8121, + "theoretical_loss": 3.515823492691192, + "tokens_seen": 1501245440 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027533600802407223, + "loss": 2.8866, + "theoretical_loss": 3.5158099628607564, + "tokens_seen": 1501310976 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002753259779338014, + "loss": 2.7687, + "theoretical_loss": 3.5157964337862824, + "tokens_seen": 1501376512 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002753159478435306, + "loss": 2.8332, + "theoretical_loss": 3.5157829054676943, + "tokens_seen": 1501442048 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027530591775325983, + "loss": 2.9669, + "theoretical_loss": 3.515769377904917, + "tokens_seen": 1501507584 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027529588766298896, + "loss": 2.8151, + "theoretical_loss": 3.5157558510978744, + "tokens_seen": 1501573120 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002752858575727182, + "loss": 2.5919, + "theoretical_loss": 3.5157423250464923, + "tokens_seen": 1501638656 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002752758274824473, + "loss": 2.8517, + "theoretical_loss": 3.515728799750695, + "tokens_seen": 1501704192 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027526579739217655, + "loss": 2.8828, + "theoretical_loss": 3.5157152752104084, + "tokens_seen": 1501769728 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027525576730190574, + "loss": 2.6389, + "theoretical_loss": 3.5157017514255564, + "tokens_seen": 1501835264 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002752457372116349, + "loss": 2.8016, + "theoretical_loss": 3.5156882283960638, + "tokens_seen": 1501900800 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002752357071213641, + "loss": 2.601, + "theoretical_loss": 3.515674706121855, + "tokens_seen": 1501966336 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002752256770310933, + "loss": 2.8109, + "theoretical_loss": 3.5156611846028563, + "tokens_seen": 1502031872 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027521564694082246, + "loss": 2.8965, + "theoretical_loss": 3.5156476638389913, + "tokens_seen": 1502097408 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002752056168505517, + "loss": 2.7276, + "theoretical_loss": 3.5156341438301855, + "tokens_seen": 1502162944 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002751955867602808, + "loss": 2.7485, + "theoretical_loss": 3.5156206245763633, + "tokens_seen": 1502228480 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027518555667001006, + "loss": 2.8687, + "theoretical_loss": 3.51560710607745, + "tokens_seen": 1502294016 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2397293, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7634007930755615, + "objective/train/theoretical_loss": 3.515593588333371, + "objective/train/tokens_used": 1522819552, + "theoretical_loss": 3.515593588333371, + "tokens_seen": 1502359552 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002751755265797392, + "loss": 2.7138, + "theoretical_loss": 3.515593588333371, + "tokens_seen": 1502359552 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002751654964894684, + "loss": 2.8189, + "theoretical_loss": 3.5155800713440506, + "tokens_seen": 1502425088 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002751554663991976, + "loss": 2.8487, + "theoretical_loss": 3.5155665551094133, + "tokens_seen": 1502490624 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002751454363089268, + "loss": 2.8784, + "theoretical_loss": 3.515553039629385, + "tokens_seen": 1502556160 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027513540621865596, + "loss": 2.727, + "theoretical_loss": 3.5155395249038897, + "tokens_seen": 1502621696 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002751253761283852, + "loss": 2.8588, + "theoretical_loss": 3.515526010932853, + "tokens_seen": 1502687232 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002751153460381143, + "loss": 2.78, + "theoretical_loss": 3.5155124977162, + "tokens_seen": 1502752768 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027510531594784356, + "loss": 2.9342, + "theoretical_loss": 3.5154989852538554, + "tokens_seen": 1502818304 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002750952858575727, + "loss": 2.7646, + "theoretical_loss": 3.5154854735457444, + "tokens_seen": 1502883840 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002750852557673019, + "loss": 2.7797, + "theoretical_loss": 3.515471962591792, + "tokens_seen": 1502949376 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002750752256770311, + "loss": 2.7484, + "theoretical_loss": 3.515458452391923, + "tokens_seen": 1503014912 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002750651955867603, + "loss": 2.7255, + "theoretical_loss": 3.515444942946062, + "tokens_seen": 1503080448 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027505516549648947, + "loss": 2.8089, + "theoretical_loss": 3.5154314342541353, + "tokens_seen": 1503145984 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027504513540621865, + "loss": 2.9587, + "theoretical_loss": 3.5154179263160668, + "tokens_seen": 1503211520 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027503510531594783, + "loss": 2.7614, + "theoretical_loss": 3.515404419131782, + "tokens_seen": 1503277056 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027502507522567706, + "loss": 2.7874, + "theoretical_loss": 3.515390912701206, + "tokens_seen": 1503342592 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002750150451354062, + "loss": 2.7195, + "theoretical_loss": 3.5153774070242645, + "tokens_seen": 1503408128 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002750050150451354, + "loss": 2.841, + "theoretical_loss": 3.5153639021008813, + "tokens_seen": 1503473664 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027499498495486455, + "loss": 2.8432, + "theoretical_loss": 3.5153503979309826, + "tokens_seen": 1503539200 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002749849548645938, + "loss": 2.8905, + "theoretical_loss": 3.5153368945144927, + "tokens_seen": 1503604736 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027497492477432297, + "loss": 2.8491, + "theoretical_loss": 3.5153233918513376, + "tokens_seen": 1503670272 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027496489468405215, + "loss": 2.9343, + "theoretical_loss": 3.5153098899414417, + "tokens_seen": 1503735808 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027495486459378133, + "loss": 2.7284, + "theoretical_loss": 3.5152963887847304, + "tokens_seen": 1503801344 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027494483450351057, + "loss": 2.9268, + "theoretical_loss": 3.5152828883811287, + "tokens_seen": 1503866880 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002749348044132397, + "loss": 2.8042, + "theoretical_loss": 3.5152693887305624, + "tokens_seen": 1503932416 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2400761, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9576215744018555, + "objective/train/theoretical_loss": 3.515255889832956, + "objective/train/tokens_used": 1524457952, + "theoretical_loss": 3.515255889832956, + "tokens_seen": 1503997952 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027492477432296893, + "loss": 2.9065, + "theoretical_loss": 3.515255889832956, + "tokens_seen": 1503997952 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027491474423269806, + "loss": 2.8544, + "theoretical_loss": 3.5152423916882354, + "tokens_seen": 1504063488 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002749047141424273, + "loss": 2.8739, + "theoretical_loss": 3.5152288942963246, + "tokens_seen": 1504129024 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027489468405215647, + "loss": 2.6056, + "theoretical_loss": 3.5152153976571503, + "tokens_seen": 1504194560 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027488465396188565, + "loss": 2.7846, + "theoretical_loss": 3.515201901770636, + "tokens_seen": 1504260096 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027487462387161483, + "loss": 2.9501, + "theoretical_loss": 3.5151884066367094, + "tokens_seen": 1504325632 + }, + { + "epoch": 4.04, + "learning_rate": 0.000274864593781344, + "loss": 2.7835, + "theoretical_loss": 3.5151749122552935, + "tokens_seen": 1504391168 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002748545636910732, + "loss": 2.8114, + "theoretical_loss": 3.515161418626314, + "tokens_seen": 1504456704 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027484453360080243, + "loss": 2.7229, + "theoretical_loss": 3.515147925749697, + "tokens_seen": 1504522240 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027483450351053156, + "loss": 2.6572, + "theoretical_loss": 3.5151344336253674, + "tokens_seen": 1504587776 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002748244734202608, + "loss": 2.7296, + "theoretical_loss": 3.51512094225325, + "tokens_seen": 1504653312 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027481444332999, + "loss": 2.9232, + "theoretical_loss": 3.5151074516332708, + "tokens_seen": 1504718848 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027480441323971916, + "loss": 2.7526, + "theoretical_loss": 3.5150939617653547, + "tokens_seen": 1504784384 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027479438314944834, + "loss": 2.8205, + "theoretical_loss": 3.515080472649427, + "tokens_seen": 1504849920 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002747843530591775, + "loss": 2.8622, + "theoretical_loss": 3.515066984285413, + "tokens_seen": 1504915456 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027477432296890675, + "loss": 2.7786, + "theoretical_loss": 3.5150534966732385, + "tokens_seen": 1504980992 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027476429287863594, + "loss": 2.801, + "theoretical_loss": 3.5150400098128287, + "tokens_seen": 1505046528 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002747542627883651, + "loss": 2.8425, + "theoretical_loss": 3.5150265237041087, + "tokens_seen": 1505112064 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002747442326980943, + "loss": 2.8639, + "theoretical_loss": 3.5150130383470035, + "tokens_seen": 1505177600 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002747342026078235, + "loss": 2.9049, + "theoretical_loss": 3.51499955374144, + "tokens_seen": 1505243136 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027472417251755266, + "loss": 2.9919, + "theoretical_loss": 3.5149860698873416, + "tokens_seen": 1505308672 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002747141424272819, + "loss": 2.7323, + "theoretical_loss": 3.5149725867846353, + "tokens_seen": 1505374208 + }, + { + "epoch": 4.04, + "learning_rate": 0.000274704112337011, + "loss": 2.706, + "theoretical_loss": 3.514959104433246, + "tokens_seen": 1505439744 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027469408224674026, + "loss": 2.8348, + "theoretical_loss": 3.514945622833099, + "tokens_seen": 1505505280 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002746840521564694, + "loss": 2.8984, + "theoretical_loss": 3.5149321419841195, + "tokens_seen": 1505570816 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2402245, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7521979808807373, + "objective/train/theoretical_loss": 3.5149186618862336, + "objective/train/tokens_used": 1526096352, + "theoretical_loss": 3.5149186618862336, + "tokens_seen": 1505636352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002746740220661986, + "loss": 2.6714, + "theoretical_loss": 3.5149186618862336, + "tokens_seen": 1505636352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002746639919759278, + "loss": 3.0262, + "theoretical_loss": 3.514905182539366, + "tokens_seen": 1505701888 + }, + { + "epoch": 4.04, + "learning_rate": 0.000274653961885657, + "loss": 2.8237, + "theoretical_loss": 3.5148917039434426, + "tokens_seen": 1505767424 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027464393179538616, + "loss": 2.7552, + "theoretical_loss": 3.5148782260983893, + "tokens_seen": 1505832960 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002746339017051154, + "loss": 2.8029, + "theoretical_loss": 3.514864749004131, + "tokens_seen": 1505898496 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002746238716148445, + "loss": 2.8473, + "theoretical_loss": 3.514851272660593, + "tokens_seen": 1505964032 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027461384152457376, + "loss": 2.8226, + "theoretical_loss": 3.5148377970677016, + "tokens_seen": 1506029568 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002746038114343029, + "loss": 2.7189, + "theoretical_loss": 3.514824322225382, + "tokens_seen": 1506095104 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002745937813440321, + "loss": 2.8085, + "theoretical_loss": 3.5148108481335596, + "tokens_seen": 1506160640 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002745837512537613, + "loss": 2.8553, + "theoretical_loss": 3.5147973747921597, + "tokens_seen": 1506226176 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002745737211634905, + "loss": 2.9639, + "theoretical_loss": 3.514783902201108, + "tokens_seen": 1506291712 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027456369107321967, + "loss": 2.5748, + "theoretical_loss": 3.5147704303603304, + "tokens_seen": 1506357248 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027455366098294885, + "loss": 2.8442, + "theoretical_loss": 3.5147569592697527, + "tokens_seen": 1506422784 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027454363089267803, + "loss": 2.8621, + "theoretical_loss": 3.5147434889292994, + "tokens_seen": 1506488320 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027453360080240726, + "loss": 2.8387, + "theoretical_loss": 3.5147300193388973, + "tokens_seen": 1506553856 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002745235707121364, + "loss": 2.8115, + "theoretical_loss": 3.514716550498471, + "tokens_seen": 1506619392 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002745135406218656, + "loss": 2.7497, + "theoretical_loss": 3.514703082407947, + "tokens_seen": 1506684928 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027450351053159475, + "loss": 2.885, + "theoretical_loss": 3.514689615067251, + "tokens_seen": 1506750464 + }, + { + "epoch": 4.04, + "learning_rate": 0.000274493480441324, + "loss": 2.8, + "theoretical_loss": 3.5146761484763074, + "tokens_seen": 1506816000 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027448345035105317, + "loss": 2.8271, + "theoretical_loss": 3.514662682635043, + "tokens_seen": 1506881536 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027447342026078235, + "loss": 2.778, + "theoretical_loss": 3.5146492175433828, + "tokens_seen": 1506947072 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027446339017051153, + "loss": 2.7837, + "theoretical_loss": 3.5146357532012535, + "tokens_seen": 1507012608 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027445336008024077, + "loss": 2.8996, + "theoretical_loss": 3.5146222896085795, + "tokens_seen": 1507078144 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002744433299899699, + "loss": 2.8392, + "theoretical_loss": 3.5146088267652873, + "tokens_seen": 1507143680 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027443329989969913, + "loss": 2.8888, + "theoretical_loss": 3.514595364671302, + "tokens_seen": 1507209216 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2404963, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.612988233566284, + "objective/train/theoretical_loss": 3.51458190332655, + "objective/train/tokens_used": 1527734752, + "theoretical_loss": 3.51458190332655, + "tokens_seen": 1507274752 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027442326980942826, + "loss": 2.8038, + "theoretical_loss": 3.51458190332655, + "tokens_seen": 1507274752 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002744132397191575, + "loss": 2.8043, + "theoretical_loss": 3.5145684427309565, + "tokens_seen": 1507340288 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027440320962888667, + "loss": 2.8002, + "theoretical_loss": 3.5145549828844476, + "tokens_seen": 1507405824 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027439317953861585, + "loss": 2.8318, + "theoretical_loss": 3.5145415237869493, + "tokens_seen": 1507471360 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027438314944834503, + "loss": 2.8305, + "theoretical_loss": 3.5145280654383866, + "tokens_seen": 1507536896 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002743731193580742, + "loss": 2.7254, + "theoretical_loss": 3.5145146078386853, + "tokens_seen": 1507602432 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002743630892678034, + "loss": 2.7037, + "theoretical_loss": 3.5145011509877717, + "tokens_seen": 1507667968 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027435305917753263, + "loss": 2.8544, + "theoretical_loss": 3.514487694885572, + "tokens_seen": 1507733504 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027434302908726176, + "loss": 2.9237, + "theoretical_loss": 3.5144742395320105, + "tokens_seen": 1507799040 + }, + { + "epoch": 4.04, + "learning_rate": 0.000274332998996991, + "loss": 2.766, + "theoretical_loss": 3.5144607849270146, + "tokens_seen": 1507864576 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002743229689067202, + "loss": 2.9511, + "theoretical_loss": 3.5144473310705093, + "tokens_seen": 1507930112 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027431293881644936, + "loss": 2.8507, + "theoretical_loss": 3.5144338779624205, + "tokens_seen": 1507995648 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027430290872617854, + "loss": 2.9368, + "theoretical_loss": 3.5144204256026743, + "tokens_seen": 1508061184 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002742928786359077, + "loss": 2.9524, + "theoretical_loss": 3.514406973991196, + "tokens_seen": 1508126720 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002742828485456369, + "loss": 2.6397, + "theoretical_loss": 3.5143935231279118, + "tokens_seen": 1508192256 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027427281845536614, + "loss": 2.8578, + "theoretical_loss": 3.514380073012748, + "tokens_seen": 1508257792 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027426278836509526, + "loss": 2.8081, + "theoretical_loss": 3.51436662364563, + "tokens_seen": 1508323328 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002742527582748245, + "loss": 2.8011, + "theoretical_loss": 3.514353175026484, + "tokens_seen": 1508388864 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002742427281845536, + "loss": 2.8196, + "theoretical_loss": 3.5143397271552352, + "tokens_seen": 1508454400 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027423269809428286, + "loss": 2.8456, + "theoretical_loss": 3.5143262800318102, + "tokens_seen": 1508519936 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027422266800401204, + "loss": 3.0027, + "theoretical_loss": 3.5143128336561347, + "tokens_seen": 1508585472 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002742126379137412, + "loss": 2.6373, + "theoretical_loss": 3.5142993880281352, + "tokens_seen": 1508651008 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002742026078234704, + "loss": 2.6981, + "theoretical_loss": 3.5142859431477365, + "tokens_seen": 1508716544 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002741925777331996, + "loss": 2.8494, + "theoretical_loss": 3.5142724990148655, + "tokens_seen": 1508782080 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027418254764292877, + "loss": 2.875, + "theoretical_loss": 3.5142590556294477, + "tokens_seen": 1508847616 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2407956, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.505647897720337, + "objective/train/theoretical_loss": 3.5142456129914095, + "objective/train/tokens_used": 1529373152, + "theoretical_loss": 3.5142456129914095, + "tokens_seen": 1508913152 + }, + { + "epoch": 4.04, + "learning_rate": 0.000274172517552658, + "loss": 2.8707, + "theoretical_loss": 3.5142456129914095, + "tokens_seen": 1508913152 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027416248746238713, + "loss": 2.7779, + "theoretical_loss": 3.5142321711006765, + "tokens_seen": 1508978688 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027415245737211636, + "loss": 2.9615, + "theoretical_loss": 3.514218729957175, + "tokens_seen": 1509044224 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027414242728184554, + "loss": 2.5542, + "theoretical_loss": 3.514205289560831, + "tokens_seen": 1509109760 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002741323971915747, + "loss": 2.784, + "theoretical_loss": 3.51419184991157, + "tokens_seen": 1509175296 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002741223671013039, + "loss": 2.8123, + "theoretical_loss": 3.5141784110093184, + "tokens_seen": 1509240832 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002741123370110331, + "loss": 2.8499, + "theoretical_loss": 3.5141649728540028, + "tokens_seen": 1509306368 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027410230692076227, + "loss": 2.7957, + "theoretical_loss": 3.514151535445548, + "tokens_seen": 1509371904 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002740922768304915, + "loss": 2.9486, + "theoretical_loss": 3.5141380987838815, + "tokens_seen": 1509437440 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027408224674022063, + "loss": 2.6983, + "theoretical_loss": 3.5141246628689276, + "tokens_seen": 1509502976 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027407221664994987, + "loss": 2.9341, + "theoretical_loss": 3.5141112277006146, + "tokens_seen": 1509568512 + }, + { + "epoch": 4.04, + "learning_rate": 0.000274062186559679, + "loss": 2.6801, + "theoretical_loss": 3.514097793278867, + "tokens_seen": 1509634048 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027405215646940823, + "loss": 2.8085, + "theoretical_loss": 3.5140843596036113, + "tokens_seen": 1509699584 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002740421263791374, + "loss": 2.9272, + "theoretical_loss": 3.5140709266747736, + "tokens_seen": 1509765120 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002740320962888666, + "loss": 2.825, + "theoretical_loss": 3.51405749449228, + "tokens_seen": 1509830656 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002740220661985958, + "loss": 2.7552, + "theoretical_loss": 3.5140440630560574, + "tokens_seen": 1509896192 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027401203610832495, + "loss": 2.8448, + "theoretical_loss": 3.5140306323660306, + "tokens_seen": 1509961728 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002740020060180542, + "loss": 2.8179, + "theoretical_loss": 3.5140172024221266, + "tokens_seen": 1510027264 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027399197592778337, + "loss": 2.7721, + "theoretical_loss": 3.5140037732242715, + "tokens_seen": 1510092800 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027398194583751255, + "loss": 2.7635, + "theoretical_loss": 3.5139903447723917, + "tokens_seen": 1510158336 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027397191574724173, + "loss": 2.8042, + "theoretical_loss": 3.5139769170664126, + "tokens_seen": 1510223872 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027396188565697097, + "loss": 2.8227, + "theoretical_loss": 3.513963490106261, + "tokens_seen": 1510289408 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002739518555667001, + "loss": 2.7863, + "theoretical_loss": 3.513950063891863, + "tokens_seen": 1510354944 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027394182547642933, + "loss": 2.6279, + "theoretical_loss": 3.5139366384231447, + "tokens_seen": 1510420480 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027393179538615846, + "loss": 2.8089, + "theoretical_loss": 3.513923213700033, + "tokens_seen": 1510486016 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2410605, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.781374931335449, + "objective/train/theoretical_loss": 3.513909789722453, + "objective/train/tokens_used": 1531011552, + "theoretical_loss": 3.513909789722453, + "tokens_seen": 1510551552 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002739217652958877, + "loss": 2.8505, + "theoretical_loss": 3.513909789722453, + "tokens_seen": 1510551552 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027391173520561687, + "loss": 2.8087, + "theoretical_loss": 3.513896366490332, + "tokens_seen": 1510617088 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027390170511534605, + "loss": 2.7745, + "theoretical_loss": 3.513882944003596, + "tokens_seen": 1510682624 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027389167502507524, + "loss": 2.9546, + "theoretical_loss": 3.513869522262171, + "tokens_seen": 1510748160 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002738816449348044, + "loss": 2.7925, + "theoretical_loss": 3.513856101265983, + "tokens_seen": 1510813696 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002738716148445336, + "loss": 2.6494, + "theoretical_loss": 3.5138426810149586, + "tokens_seen": 1510879232 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027386158475426283, + "loss": 2.6904, + "theoretical_loss": 3.5138292615090245, + "tokens_seen": 1510944768 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027385155466399196, + "loss": 2.8552, + "theoretical_loss": 3.513815842748107, + "tokens_seen": 1511010304 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002738415245737212, + "loss": 2.8364, + "theoretical_loss": 3.5138024247321313, + "tokens_seen": 1511075840 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002738314944834504, + "loss": 2.7501, + "theoretical_loss": 3.5137890074610256, + "tokens_seen": 1511141376 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027382146439317956, + "loss": 2.8747, + "theoretical_loss": 3.5137755909347144, + "tokens_seen": 1511206912 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027381143430290874, + "loss": 2.9841, + "theoretical_loss": 3.5137621751531256, + "tokens_seen": 1511272448 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002738014042126379, + "loss": 2.8032, + "theoretical_loss": 3.5137487601161843, + "tokens_seen": 1511337984 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002737913741223671, + "loss": 2.6767, + "theoretical_loss": 3.5137353458238176, + "tokens_seen": 1511403520 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027378134403209634, + "loss": 2.7822, + "theoretical_loss": 3.513721932275952, + "tokens_seen": 1511469056 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027377131394182546, + "loss": 2.7546, + "theoretical_loss": 3.5137085194725133, + "tokens_seen": 1511534592 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002737612838515547, + "loss": 2.7974, + "theoretical_loss": 3.513695107413428, + "tokens_seen": 1511600128 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002737512537612838, + "loss": 2.7839, + "theoretical_loss": 3.5136816960986232, + "tokens_seen": 1511665664 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027374122367101306, + "loss": 2.816, + "theoretical_loss": 3.513668285528025, + "tokens_seen": 1511731200 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027373119358074224, + "loss": 2.8466, + "theoretical_loss": 3.513654875701559, + "tokens_seen": 1511796736 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002737211634904714, + "loss": 2.781, + "theoretical_loss": 3.5136414666191533, + "tokens_seen": 1511862272 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002737111334002006, + "loss": 2.7414, + "theoretical_loss": 3.513628058280733, + "tokens_seen": 1511927808 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002737011033099298, + "loss": 2.7424, + "theoretical_loss": 3.5136146506862254, + "tokens_seen": 1511993344 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027369107321965897, + "loss": 2.8256, + "theoretical_loss": 3.513601243835556, + "tokens_seen": 1512058880 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002736810431293882, + "loss": 2.8617, + "theoretical_loss": 3.5135878377286525, + "tokens_seen": 1512124416 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2413370, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5713868141174316, + "objective/train/theoretical_loss": 3.5135744323654405, + "objective/train/tokens_used": 1532649952, + "theoretical_loss": 3.5135744323654405, + "tokens_seen": 1512189952 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027367101303911733, + "loss": 2.9032, + "theoretical_loss": 3.5135744323654405, + "tokens_seen": 1512189952 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027366098294884656, + "loss": 2.9696, + "theoretical_loss": 3.513561027745847, + "tokens_seen": 1512255488 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027365095285857574, + "loss": 2.8218, + "theoretical_loss": 3.5135476238697985, + "tokens_seen": 1512321024 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002736409227683049, + "loss": 2.8068, + "theoretical_loss": 3.513534220737221, + "tokens_seen": 1512386560 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002736308926780341, + "loss": 2.7052, + "theoretical_loss": 3.5135208183480415, + "tokens_seen": 1512452096 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002736208625877633, + "loss": 2.7874, + "theoretical_loss": 3.513507416702187, + "tokens_seen": 1512517632 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027361083249749247, + "loss": 2.9326, + "theoretical_loss": 3.513494015799583, + "tokens_seen": 1512583168 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002736008024072217, + "loss": 2.778, + "theoretical_loss": 3.513480615640157, + "tokens_seen": 1512648704 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027359077231695083, + "loss": 2.9127, + "theoretical_loss": 3.513467216223835, + "tokens_seen": 1512714240 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027358074222668007, + "loss": 2.7839, + "theoretical_loss": 3.5134538175505443, + "tokens_seen": 1512779776 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002735707121364092, + "loss": 2.8231, + "theoretical_loss": 3.5134404196202107, + "tokens_seen": 1512845312 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027356068204613843, + "loss": 2.8196, + "theoretical_loss": 3.5134270224327615, + "tokens_seen": 1512910848 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002735506519558676, + "loss": 2.654, + "theoretical_loss": 3.5134136259881226, + "tokens_seen": 1512976384 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002735406218655968, + "loss": 2.9949, + "theoretical_loss": 3.513400230286221, + "tokens_seen": 1513041920 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027353059177532597, + "loss": 2.6463, + "theoretical_loss": 3.5133868353269837, + "tokens_seen": 1513107456 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027352056168505515, + "loss": 2.7836, + "theoretical_loss": 3.5133734411103372, + "tokens_seen": 1513172992 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027351053159478433, + "loss": 2.8089, + "theoretical_loss": 3.5133600476362075, + "tokens_seen": 1513238528 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027350050150451357, + "loss": 2.7845, + "theoretical_loss": 3.513346654904522, + "tokens_seen": 1513304064 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002734904714142427, + "loss": 2.7828, + "theoretical_loss": 3.5133332629152076, + "tokens_seen": 1513369600 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027348044132397193, + "loss": 2.611, + "theoretical_loss": 3.51331987166819, + "tokens_seen": 1513435136 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002734704112337011, + "loss": 3.0313, + "theoretical_loss": 3.513306481163397, + "tokens_seen": 1513500672 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002734603811434303, + "loss": 2.7804, + "theoretical_loss": 3.513293091400755, + "tokens_seen": 1513566208 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002734503510531595, + "loss": 2.8973, + "theoretical_loss": 3.5132797023801903, + "tokens_seen": 1513631744 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027344032096288866, + "loss": 2.8706, + "theoretical_loss": 3.51326631410163, + "tokens_seen": 1513697280 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027343029087261784, + "loss": 2.6932, + "theoretical_loss": 3.513252926565001, + "tokens_seen": 1513762816 + }, + { + "epoch": 4.04, + "objective/train/docs_used": 2416326, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6669678688049316, + "objective/train/theoretical_loss": 3.51323953977023, + "objective/train/tokens_used": 1534288352, + "theoretical_loss": 3.51323953977023, + "tokens_seen": 1513828352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002734202607823471, + "loss": 2.7801, + "theoretical_loss": 3.51323953977023, + "tokens_seen": 1513828352 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002734102306920762, + "loss": 2.806, + "theoretical_loss": 3.513226153717243, + "tokens_seen": 1513893888 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027340020060180544, + "loss": 2.8852, + "theoretical_loss": 3.5132127684059675, + "tokens_seen": 1513959424 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027339017051153456, + "loss": 2.8512, + "theoretical_loss": 3.513199383836331, + "tokens_seen": 1514024960 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002733801404212638, + "loss": 2.7905, + "theoretical_loss": 3.513186000008259, + "tokens_seen": 1514090496 + }, + { + "epoch": 4.04, + "learning_rate": 0.000273370110330993, + "loss": 2.8397, + "theoretical_loss": 3.5131726169216786, + "tokens_seen": 1514156032 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027336008024072216, + "loss": 2.9109, + "theoretical_loss": 3.5131592345765172, + "tokens_seen": 1514221568 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027335005015045134, + "loss": 2.7027, + "theoretical_loss": 3.513145852972701, + "tokens_seen": 1514287104 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002733400200601806, + "loss": 2.763, + "theoretical_loss": 3.5131324721101578, + "tokens_seen": 1514352640 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002733299899699097, + "loss": 2.7657, + "theoretical_loss": 3.5131190919888136, + "tokens_seen": 1514418176 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027331995987963894, + "loss": 2.8176, + "theoretical_loss": 3.513105712608595, + "tokens_seen": 1514483712 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027330992978936807, + "loss": 2.818, + "theoretical_loss": 3.51309233396943, + "tokens_seen": 1514549248 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002732998996990973, + "loss": 2.6988, + "theoretical_loss": 3.5130789560712445, + "tokens_seen": 1514614784 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002732898696088265, + "loss": 2.6556, + "theoretical_loss": 3.513065578913966, + "tokens_seen": 1514680320 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027327983951855566, + "loss": 2.8301, + "theoretical_loss": 3.5130522024975215, + "tokens_seen": 1514745856 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002732698094282849, + "loss": 2.9753, + "theoretical_loss": 3.5130388268218375, + "tokens_seen": 1514811392 + }, + { + "epoch": 4.04, + "learning_rate": 0.000273259779338014, + "loss": 2.8309, + "theoretical_loss": 3.5130254518868407, + "tokens_seen": 1514876928 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027324974924774326, + "loss": 2.6889, + "theoretical_loss": 3.5130120776924585, + "tokens_seen": 1514942464 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027323971915747244, + "loss": 2.6985, + "theoretical_loss": 3.5129987042386173, + "tokens_seen": 1515008000 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002732296890672016, + "loss": 2.776, + "theoretical_loss": 3.512985331525245, + "tokens_seen": 1515073536 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002732196589769308, + "loss": 2.9445, + "theoretical_loss": 3.512971959552268, + "tokens_seen": 1515139072 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027320962888666, + "loss": 2.8661, + "theoretical_loss": 3.5129585883196137, + "tokens_seen": 1515204608 + }, + { + "epoch": 4.04, + "learning_rate": 0.00027319959879638917, + "loss": 2.6918, + "theoretical_loss": 3.512945217827208, + "tokens_seen": 1515270144 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002731895687061184, + "loss": 2.7159, + "theoretical_loss": 3.512931848074979, + "tokens_seen": 1515335680 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027317953861584753, + "loss": 2.6816, + "theoretical_loss": 3.5129184790628543, + "tokens_seen": 1515401216 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2417812, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1235430240631104, + "objective/train/theoretical_loss": 3.5129051107907587, + "objective/train/tokens_used": 1535926752, + "theoretical_loss": 3.5129051107907587, + "tokens_seen": 1515466752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027316950852557676, + "loss": 2.9433, + "theoretical_loss": 3.5129051107907587, + "tokens_seen": 1515466752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027315947843530594, + "loss": 2.7704, + "theoretical_loss": 3.5128917432586215, + "tokens_seen": 1515532288 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002731494483450351, + "loss": 2.7049, + "theoretical_loss": 3.512878376466368, + "tokens_seen": 1515597824 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002731394182547643, + "loss": 2.8149, + "theoretical_loss": 3.5128650104139263, + "tokens_seen": 1515663360 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002731293881644935, + "loss": 2.7439, + "theoretical_loss": 3.5128516451012235, + "tokens_seen": 1515728896 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027311935807422267, + "loss": 2.8652, + "theoretical_loss": 3.5128382805281864, + "tokens_seen": 1515794432 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002731093279839519, + "loss": 2.8761, + "theoretical_loss": 3.5128249166947416, + "tokens_seen": 1515859968 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027309929789368103, + "loss": 2.7396, + "theoretical_loss": 3.5128115536008173, + "tokens_seen": 1515925504 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027308926780341027, + "loss": 2.7843, + "theoretical_loss": 3.5127981912463397, + "tokens_seen": 1515991040 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002730792377131394, + "loss": 2.7295, + "theoretical_loss": 3.512784829631236, + "tokens_seen": 1516056576 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027306920762286863, + "loss": 2.8069, + "theoretical_loss": 3.512771468755434, + "tokens_seen": 1516122112 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002730591775325978, + "loss": 2.755, + "theoretical_loss": 3.5127581086188595, + "tokens_seen": 1516187648 + }, + { + "epoch": 4.05, + "learning_rate": 0.000273049147442327, + "loss": 2.7612, + "theoretical_loss": 3.5127447492214414, + "tokens_seen": 1516253184 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027303911735205617, + "loss": 2.7294, + "theoretical_loss": 3.512731390563106, + "tokens_seen": 1516318720 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027302908726178535, + "loss": 2.7795, + "theoretical_loss": 3.5127180326437797, + "tokens_seen": 1516384256 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027301905717151453, + "loss": 2.9348, + "theoretical_loss": 3.5127046754633913, + "tokens_seen": 1516449792 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027300902708124377, + "loss": 2.7213, + "theoretical_loss": 3.512691319021867, + "tokens_seen": 1516515328 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002729989969909729, + "loss": 2.7755, + "theoretical_loss": 3.512677963319134, + "tokens_seen": 1516580864 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027298896690070213, + "loss": 2.8612, + "theoretical_loss": 3.512664608355119, + "tokens_seen": 1516646400 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002729789368104313, + "loss": 2.6267, + "theoretical_loss": 3.5126512541297505, + "tokens_seen": 1516711936 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002729689067201605, + "loss": 2.7992, + "theoretical_loss": 3.512637900642955, + "tokens_seen": 1516777472 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002729588766298897, + "loss": 2.8228, + "theoretical_loss": 3.51262454789466, + "tokens_seen": 1516843008 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027294884653961886, + "loss": 2.8949, + "theoretical_loss": 3.5126111958847925, + "tokens_seen": 1516908544 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027293881644934804, + "loss": 2.8696, + "theoretical_loss": 3.5125978446132797, + "tokens_seen": 1516974080 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002729287863590773, + "loss": 2.8869, + "theoretical_loss": 3.5125844940800492, + "tokens_seen": 1517039616 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2420464, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8062939643859863, + "objective/train/theoretical_loss": 3.512571144285028, + "objective/train/tokens_used": 1537565152, + "theoretical_loss": 3.512571144285028, + "tokens_seen": 1517105152 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002729187562688064, + "loss": 2.6589, + "theoretical_loss": 3.512571144285028, + "tokens_seen": 1517105152 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027290872617853564, + "loss": 2.8056, + "theoretical_loss": 3.5125577952281435, + "tokens_seen": 1517170688 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027289869608826476, + "loss": 2.7532, + "theoretical_loss": 3.512544446909323, + "tokens_seen": 1517236224 + }, + { + "epoch": 4.05, + "learning_rate": 0.000272888665997994, + "loss": 2.9224, + "theoretical_loss": 3.512531099328494, + "tokens_seen": 1517301760 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002728786359077232, + "loss": 2.8535, + "theoretical_loss": 3.512517752485584, + "tokens_seen": 1517367296 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027286860581745236, + "loss": 2.8782, + "theoretical_loss": 3.5125044063805193, + "tokens_seen": 1517432832 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027285857572718154, + "loss": 2.8137, + "theoretical_loss": 3.5124910610132285, + "tokens_seen": 1517498368 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002728485456369108, + "loss": 2.8311, + "theoretical_loss": 3.512477716383638, + "tokens_seen": 1517563904 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002728385155466399, + "loss": 2.6566, + "theoretical_loss": 3.5124643724916753, + "tokens_seen": 1517629440 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027282848545636914, + "loss": 2.6619, + "theoretical_loss": 3.512451029337269, + "tokens_seen": 1517694976 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027281845536609827, + "loss": 2.9203, + "theoretical_loss": 3.512437686920345, + "tokens_seen": 1517760512 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002728084252758275, + "loss": 2.5574, + "theoretical_loss": 3.512424345240831, + "tokens_seen": 1517826048 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002727983951855567, + "loss": 2.8238, + "theoretical_loss": 3.512411004298655, + "tokens_seen": 1517891584 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027278836509528586, + "loss": 2.8616, + "theoretical_loss": 3.5123976640937435, + "tokens_seen": 1517957120 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027277833500501504, + "loss": 2.67, + "theoretical_loss": 3.512384324626025, + "tokens_seen": 1518022656 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002727683049147442, + "loss": 2.8896, + "theoretical_loss": 3.5123709858954264, + "tokens_seen": 1518088192 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002727582748244734, + "loss": 2.9505, + "theoretical_loss": 3.512357647901875, + "tokens_seen": 1518153728 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027274824473420264, + "loss": 2.7291, + "theoretical_loss": 3.5123443106452985, + "tokens_seen": 1518219264 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027273821464393177, + "loss": 2.7205, + "theoretical_loss": 3.5123309741256246, + "tokens_seen": 1518284800 + }, + { + "epoch": 4.05, + "learning_rate": 0.000272728184553661, + "loss": 2.839, + "theoretical_loss": 3.51231763834278, + "tokens_seen": 1518350336 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027271815446339013, + "loss": 2.7407, + "theoretical_loss": 3.512304303296693, + "tokens_seen": 1518415872 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027270812437311937, + "loss": 2.6824, + "theoretical_loss": 3.5122909689872905, + "tokens_seen": 1518481408 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027269809428284855, + "loss": 2.6617, + "theoretical_loss": 3.5122776354145007, + "tokens_seen": 1518546944 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027268806419257773, + "loss": 2.7063, + "theoretical_loss": 3.51226430257825, + "tokens_seen": 1518612480 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002726780341023069, + "loss": 2.8918, + "theoretical_loss": 3.5122509704784672, + "tokens_seen": 1518678016 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2423290, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9472241401672363, + "objective/train/theoretical_loss": 3.512237639115079, + "objective/train/tokens_used": 1539203552, + "theoretical_loss": 3.512237639115079, + "tokens_seen": 1518743552 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027266800401203614, + "loss": 2.7778, + "theoretical_loss": 3.512237639115079, + "tokens_seen": 1518743552 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027265797392176527, + "loss": 2.8747, + "theoretical_loss": 3.5122243084880136, + "tokens_seen": 1518809088 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002726479438314945, + "loss": 2.6769, + "theoretical_loss": 3.5122109785971976, + "tokens_seen": 1518874624 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027263791374122363, + "loss": 2.8796, + "theoretical_loss": 3.5121976494425593, + "tokens_seen": 1518940160 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027262788365095287, + "loss": 2.8952, + "theoretical_loss": 3.5121843210240264, + "tokens_seen": 1519005696 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027261785356068205, + "loss": 2.7803, + "theoretical_loss": 3.512170993341526, + "tokens_seen": 1519071232 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027260782347041123, + "loss": 2.9078, + "theoretical_loss": 3.5121576663949865, + "tokens_seen": 1519136768 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002725977933801404, + "loss": 2.9274, + "theoretical_loss": 3.512144340184334, + "tokens_seen": 1519202304 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002725877632898696, + "loss": 2.804, + "theoretical_loss": 3.5121310147094977, + "tokens_seen": 1519267840 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002725777331995988, + "loss": 2.8615, + "theoretical_loss": 3.5121176899704047, + "tokens_seen": 1519333376 + }, + { + "epoch": 4.05, + "learning_rate": 0.000272567703109328, + "loss": 2.8164, + "theoretical_loss": 3.512104365966982, + "tokens_seen": 1519398912 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027255767301905714, + "loss": 2.8611, + "theoretical_loss": 3.5120910426991583, + "tokens_seen": 1519464448 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027254764292878637, + "loss": 2.8045, + "theoretical_loss": 3.512077720166861, + "tokens_seen": 1519529984 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002725376128385155, + "loss": 2.7861, + "theoretical_loss": 3.5120643983700166, + "tokens_seen": 1519595520 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027252758274824473, + "loss": 2.7388, + "theoretical_loss": 3.5120510773085547, + "tokens_seen": 1519661056 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027251755265797397, + "loss": 2.7896, + "theoretical_loss": 3.5120377569824015, + "tokens_seen": 1519726592 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002725075225677031, + "loss": 2.9024, + "theoretical_loss": 3.512024437391485, + "tokens_seen": 1519792128 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027249749247743233, + "loss": 2.8814, + "theoretical_loss": 3.5120111185357334, + "tokens_seen": 1519857664 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002724874623871615, + "loss": 2.7024, + "theoretical_loss": 3.5119978004150747, + "tokens_seen": 1519923200 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002724774322968907, + "loss": 2.6898, + "theoretical_loss": 3.511984483029435, + "tokens_seen": 1519988736 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002724674022066199, + "loss": 2.7788, + "theoretical_loss": 3.511971166378744, + "tokens_seen": 1520054272 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027245737211634906, + "loss": 2.8612, + "theoretical_loss": 3.5119578504629287, + "tokens_seen": 1520119808 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027244734202607824, + "loss": 2.8949, + "theoretical_loss": 3.5119445352819163, + "tokens_seen": 1520185344 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002724373119358075, + "loss": 2.8848, + "theoretical_loss": 3.5119312208356352, + "tokens_seen": 1520250880 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002724272818455366, + "loss": 2.9212, + "theoretical_loss": 3.5119179071240127, + "tokens_seen": 1520316416 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2426119, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.816171169281006, + "objective/train/theoretical_loss": 3.5119045941469778, + "objective/train/tokens_used": 1540841952, + "theoretical_loss": 3.5119045941469778, + "tokens_seen": 1520381952 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027241725175526584, + "loss": 2.9407, + "theoretical_loss": 3.5119045941469778, + "tokens_seen": 1520381952 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027240722166499496, + "loss": 2.7175, + "theoretical_loss": 3.5118912819044565, + "tokens_seen": 1520447488 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002723971915747242, + "loss": 2.8901, + "theoretical_loss": 3.5118779703963785, + "tokens_seen": 1520513024 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002723871614844534, + "loss": 2.7458, + "theoretical_loss": 3.5118646596226695, + "tokens_seen": 1520578560 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027237713139418256, + "loss": 2.8579, + "theoretical_loss": 3.5118513495832593, + "tokens_seen": 1520644096 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027236710130391174, + "loss": 2.7353, + "theoretical_loss": 3.511838040278075, + "tokens_seen": 1520709632 + }, + { + "epoch": 4.05, + "learning_rate": 0.000272357071213641, + "loss": 2.9193, + "theoretical_loss": 3.511824731707044, + "tokens_seen": 1520775168 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002723470411233701, + "loss": 2.7797, + "theoretical_loss": 3.5118114238700953, + "tokens_seen": 1520840704 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027233701103309934, + "loss": 2.7825, + "theoretical_loss": 3.5117981167671557, + "tokens_seen": 1520906240 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027232698094282847, + "loss": 2.823, + "theoretical_loss": 3.5117848103981535, + "tokens_seen": 1520971776 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002723169508525577, + "loss": 2.8126, + "theoretical_loss": 3.5117715047630162, + "tokens_seen": 1521037312 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002723069207622869, + "loss": 2.8638, + "theoretical_loss": 3.5117581998616725, + "tokens_seen": 1521102848 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027229689067201606, + "loss": 2.8865, + "theoretical_loss": 3.51174489569405, + "tokens_seen": 1521168384 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027228686058174524, + "loss": 2.9199, + "theoretical_loss": 3.511731592260076, + "tokens_seen": 1521233920 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002722768304914744, + "loss": 2.9926, + "theoretical_loss": 3.511718289559679, + "tokens_seen": 1521299456 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002722668004012036, + "loss": 2.8906, + "theoretical_loss": 3.511704987592787, + "tokens_seen": 1521364992 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027225677031093284, + "loss": 2.8111, + "theoretical_loss": 3.5116916863593284, + "tokens_seen": 1521430528 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027224674022066197, + "loss": 2.7845, + "theoretical_loss": 3.5116783858592298, + "tokens_seen": 1521496064 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002722367101303912, + "loss": 2.7464, + "theoretical_loss": 3.511665086092421, + "tokens_seen": 1521561600 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027222668004012033, + "loss": 2.9586, + "theoretical_loss": 3.5116517870588284, + "tokens_seen": 1521627136 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027221664994984957, + "loss": 2.8486, + "theoretical_loss": 3.511638488758381, + "tokens_seen": 1521692672 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027220661985957875, + "loss": 2.879, + "theoretical_loss": 3.5116251911910057, + "tokens_seen": 1521758208 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027219658976930793, + "loss": 2.8083, + "theoretical_loss": 3.5116118943566317, + "tokens_seen": 1521823744 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002721865596790371, + "loss": 2.7321, + "theoretical_loss": 3.511598598255187, + "tokens_seen": 1521889280 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027217652958876635, + "loss": 2.7696, + "theoretical_loss": 3.5115853028865986, + "tokens_seen": 1521954816 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2428664, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.947704315185547, + "objective/train/theoretical_loss": 3.511572008250795, + "objective/train/tokens_used": 1542480352, + "theoretical_loss": 3.511572008250795, + "tokens_seen": 1522020352 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027216649949849547, + "loss": 2.7845, + "theoretical_loss": 3.511572008250795, + "tokens_seen": 1522020352 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002721564694082247, + "loss": 2.808, + "theoretical_loss": 3.511558714347705, + "tokens_seen": 1522085888 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027214643931795383, + "loss": 2.7596, + "theoretical_loss": 3.5115454211772557, + "tokens_seen": 1522151424 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027213640922768307, + "loss": 2.8114, + "theoretical_loss": 3.511532128739376, + "tokens_seen": 1522216960 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027212637913741225, + "loss": 2.8226, + "theoretical_loss": 3.5115188370339934, + "tokens_seen": 1522282496 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027211634904714143, + "loss": 2.8403, + "theoretical_loss": 3.511505546061036, + "tokens_seen": 1522348032 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002721063189568706, + "loss": 2.8085, + "theoretical_loss": 3.511492255820432, + "tokens_seen": 1522413568 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002720962888665998, + "loss": 2.8279, + "theoretical_loss": 3.5114789663121098, + "tokens_seen": 1522479104 + }, + { + "epoch": 4.05, + "learning_rate": 0.000272086258776329, + "loss": 2.8859, + "theoretical_loss": 3.511465677535998, + "tokens_seen": 1522544640 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002720762286860582, + "loss": 2.808, + "theoretical_loss": 3.511452389492023, + "tokens_seen": 1522610176 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027206619859578734, + "loss": 2.808, + "theoretical_loss": 3.511439102180115, + "tokens_seen": 1522675712 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027205616850551657, + "loss": 2.8369, + "theoretical_loss": 3.5114258156002007, + "tokens_seen": 1522741248 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002720461384152457, + "loss": 2.6909, + "theoretical_loss": 3.5114125297522087, + "tokens_seen": 1522806784 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027203610832497493, + "loss": 2.8697, + "theoretical_loss": 3.511399244636068, + "tokens_seen": 1522872320 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002720260782347041, + "loss": 2.81, + "theoretical_loss": 3.511385960251705, + "tokens_seen": 1522937856 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002720160481444333, + "loss": 2.9485, + "theoretical_loss": 3.5113726765990494, + "tokens_seen": 1523003392 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002720060180541625, + "loss": 2.7376, + "theoretical_loss": 3.511359393678029, + "tokens_seen": 1523068928 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002719959879638917, + "loss": 2.8931, + "theoretical_loss": 3.511346111488572, + "tokens_seen": 1523134464 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027198595787362084, + "loss": 2.7225, + "theoretical_loss": 3.5113328300306064, + "tokens_seen": 1523200000 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002719759277833501, + "loss": 2.6874, + "theoretical_loss": 3.511319549304061, + "tokens_seen": 1523265536 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002719658976930792, + "loss": 2.8479, + "theoretical_loss": 3.5113062693088644, + "tokens_seen": 1523331072 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027195586760280844, + "loss": 2.7572, + "theoretical_loss": 3.511292990044943, + "tokens_seen": 1523396608 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002719458375125376, + "loss": 2.7585, + "theoretical_loss": 3.511279711512227, + "tokens_seen": 1523462144 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002719358074222668, + "loss": 2.7222, + "theoretical_loss": 3.5112664337106434, + "tokens_seen": 1523527680 + }, + { + "epoch": 4.05, + "learning_rate": 0.000271925777331996, + "loss": 2.7891, + "theoretical_loss": 3.5112531566401217, + "tokens_seen": 1523593216 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2431550, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7729578018188477, + "objective/train/theoretical_loss": 3.5112398803005895, + "objective/train/tokens_used": 1544118752, + "theoretical_loss": 3.5112398803005895, + "tokens_seen": 1523658752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027191574724172516, + "loss": 2.7481, + "theoretical_loss": 3.5112398803005895, + "tokens_seen": 1523658752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027190571715145434, + "loss": 2.8661, + "theoretical_loss": 3.5112266046919745, + "tokens_seen": 1523724288 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002718956870611836, + "loss": 2.8355, + "theoretical_loss": 3.5112133298142068, + "tokens_seen": 1523789824 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002718856569709127, + "loss": 2.739, + "theoretical_loss": 3.5112000556672127, + "tokens_seen": 1523855360 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027187562688064194, + "loss": 2.8497, + "theoretical_loss": 3.5111867822509217, + "tokens_seen": 1523920896 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027186559679037107, + "loss": 2.7644, + "theoretical_loss": 3.511173509565262, + "tokens_seen": 1523986432 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002718555667001003, + "loss": 2.8987, + "theoretical_loss": 3.511160237610162, + "tokens_seen": 1524051968 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002718455366098295, + "loss": 2.7201, + "theoretical_loss": 3.51114696638555, + "tokens_seen": 1524117504 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027183550651955867, + "loss": 2.7878, + "theoretical_loss": 3.5111336958913544, + "tokens_seen": 1524183040 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027182547642928785, + "loss": 2.8867, + "theoretical_loss": 3.511120426127503, + "tokens_seen": 1524248576 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002718154463390171, + "loss": 2.7948, + "theoretical_loss": 3.5111071570939254, + "tokens_seen": 1524314112 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002718054162487462, + "loss": 2.8951, + "theoretical_loss": 3.5110938887905494, + "tokens_seen": 1524379648 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027179538615847544, + "loss": 2.7276, + "theoretical_loss": 3.511080621217303, + "tokens_seen": 1524445184 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027178535606820457, + "loss": 2.7125, + "theoretical_loss": 3.5110673543741155, + "tokens_seen": 1524510720 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002717753259779338, + "loss": 2.7419, + "theoretical_loss": 3.5110540882609147, + "tokens_seen": 1524576256 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027176529588766304, + "loss": 2.7941, + "theoretical_loss": 3.511040822877629, + "tokens_seen": 1524641792 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027175526579739217, + "loss": 2.7689, + "theoretical_loss": 3.511027558224187, + "tokens_seen": 1524707328 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002717452357071214, + "loss": 2.8249, + "theoretical_loss": 3.511014294300518, + "tokens_seen": 1524772864 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027173520561685053, + "loss": 2.6622, + "theoretical_loss": 3.5110010311065496, + "tokens_seen": 1524838400 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027172517552657977, + "loss": 2.8201, + "theoretical_loss": 3.5109877686422104, + "tokens_seen": 1524903936 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027171514543630895, + "loss": 2.7532, + "theoretical_loss": 3.510974506907429, + "tokens_seen": 1524969472 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027170511534603813, + "loss": 2.8739, + "theoretical_loss": 3.5109612459021333, + "tokens_seen": 1525035008 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002716950852557673, + "loss": 2.7334, + "theoretical_loss": 3.5109479856262533, + "tokens_seen": 1525100544 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027168505516549655, + "loss": 2.8581, + "theoretical_loss": 3.510934726079716, + "tokens_seen": 1525166080 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027167502507522567, + "loss": 2.7862, + "theoretical_loss": 3.510921467262451, + "tokens_seen": 1525231616 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2434422, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7701640129089355, + "objective/train/theoretical_loss": 3.510908209174387, + "objective/train/tokens_used": 1545757152, + "theoretical_loss": 3.510908209174387, + "tokens_seen": 1525297152 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002716649949849549, + "loss": 2.8172, + "theoretical_loss": 3.510908209174387, + "tokens_seen": 1525297152 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027165496489468403, + "loss": 2.9113, + "theoretical_loss": 3.510894951815451, + "tokens_seen": 1525362688 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027164493480441327, + "loss": 2.8396, + "theoretical_loss": 3.510881695185573, + "tokens_seen": 1525428224 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027163490471414245, + "loss": 2.7707, + "theoretical_loss": 3.5108684392846814, + "tokens_seen": 1525493760 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027162487462387163, + "loss": 2.7478, + "theoretical_loss": 3.5108551841127045, + "tokens_seen": 1525559296 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002716148445336008, + "loss": 2.6596, + "theoretical_loss": 3.5108419296695708, + "tokens_seen": 1525624832 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027160481444333, + "loss": 2.8201, + "theoretical_loss": 3.5108286759552096, + "tokens_seen": 1525690368 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002715947843530592, + "loss": 2.736, + "theoretical_loss": 3.5108154229695487, + "tokens_seen": 1525755904 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002715847542627884, + "loss": 2.9388, + "theoretical_loss": 3.510802170712517, + "tokens_seen": 1525821440 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027157472417251754, + "loss": 2.8535, + "theoretical_loss": 3.510788919184044, + "tokens_seen": 1525886976 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002715646940822468, + "loss": 2.8415, + "theoretical_loss": 3.5107756683840567, + "tokens_seen": 1525952512 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002715546639919759, + "loss": 2.7491, + "theoretical_loss": 3.510762418312485, + "tokens_seen": 1526018048 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027154463390170514, + "loss": 2.7431, + "theoretical_loss": 3.5107491689692574, + "tokens_seen": 1526083584 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002715346038114343, + "loss": 2.9299, + "theoretical_loss": 3.5107359203543025, + "tokens_seen": 1526149120 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002715245737211635, + "loss": 2.7936, + "theoretical_loss": 3.5107226724675487, + "tokens_seen": 1526214656 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002715145436308927, + "loss": 2.7648, + "theoretical_loss": 3.510709425308925, + "tokens_seen": 1526280192 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002715045135406219, + "loss": 2.814, + "theoretical_loss": 3.51069617887836, + "tokens_seen": 1526345728 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027149448345035104, + "loss": 2.8994, + "theoretical_loss": 3.5106829331757825, + "tokens_seen": 1526411264 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002714844533600803, + "loss": 2.8511, + "theoretical_loss": 3.510669688201121, + "tokens_seen": 1526476800 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002714744232698094, + "loss": 2.8339, + "theoretical_loss": 3.5106564439543053, + "tokens_seen": 1526542336 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027146439317953864, + "loss": 2.7835, + "theoretical_loss": 3.510643200435263, + "tokens_seen": 1526607872 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002714543630892678, + "loss": 2.7297, + "theoretical_loss": 3.5106299576439226, + "tokens_seen": 1526673408 + }, + { + "epoch": 4.05, + "learning_rate": 0.000271444332998997, + "loss": 2.7915, + "theoretical_loss": 3.510616715580214, + "tokens_seen": 1526738944 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002714343029087262, + "loss": 2.8449, + "theoretical_loss": 3.510603474244065, + "tokens_seen": 1526804480 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027142427281845536, + "loss": 2.6998, + "theoretical_loss": 3.510590233635406, + "tokens_seen": 1526870016 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2437563, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.715134859085083, + "objective/train/theoretical_loss": 3.5105769937541638, + "objective/train/tokens_used": 1547395552, + "theoretical_loss": 3.5105769937541638, + "tokens_seen": 1526935552 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027141424272818454, + "loss": 2.7436, + "theoretical_loss": 3.5105769937541638, + "tokens_seen": 1526935552 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002714042126379138, + "loss": 2.8324, + "theoretical_loss": 3.510563754600268, + "tokens_seen": 1527001088 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002713941825476429, + "loss": 2.8015, + "theoretical_loss": 3.5105505161736477, + "tokens_seen": 1527066624 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027138415245737214, + "loss": 2.8082, + "theoretical_loss": 3.5105372784742315, + "tokens_seen": 1527132160 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027137412236710127, + "loss": 2.8538, + "theoretical_loss": 3.5105240415019487, + "tokens_seen": 1527197696 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002713640922768305, + "loss": 2.7869, + "theoretical_loss": 3.5105108052567275, + "tokens_seen": 1527263232 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002713540621865597, + "loss": 2.7847, + "theoretical_loss": 3.5104975697384972, + "tokens_seen": 1527328768 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027134403209628887, + "loss": 2.7906, + "theoretical_loss": 3.5104843349471864, + "tokens_seen": 1527394304 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027133400200601805, + "loss": 2.7987, + "theoretical_loss": 3.510471100882724, + "tokens_seen": 1527459840 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002713239719157473, + "loss": 2.7196, + "theoretical_loss": 3.510457867545039, + "tokens_seen": 1527525376 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002713139418254764, + "loss": 2.8619, + "theoretical_loss": 3.5104446349340606, + "tokens_seen": 1527590912 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027130391173520564, + "loss": 2.6564, + "theoretical_loss": 3.5104314030497172, + "tokens_seen": 1527656448 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027129388164493477, + "loss": 2.8272, + "theoretical_loss": 3.510418171891938, + "tokens_seen": 1527721984 + }, + { + "epoch": 4.05, + "learning_rate": 0.000271283851554664, + "loss": 2.8737, + "theoretical_loss": 3.510404941460652, + "tokens_seen": 1527787520 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002712738214643932, + "loss": 2.788, + "theoretical_loss": 3.510391711755788, + "tokens_seen": 1527853056 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027126379137412237, + "loss": 2.8274, + "theoretical_loss": 3.5103784827772753, + "tokens_seen": 1527918592 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027125376128385155, + "loss": 3.0172, + "theoretical_loss": 3.510365254525042, + "tokens_seen": 1527984128 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027124373119358073, + "loss": 2.7997, + "theoretical_loss": 3.5103520269990183, + "tokens_seen": 1528049664 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002712337011033099, + "loss": 2.6979, + "theoretical_loss": 3.510338800199132, + "tokens_seen": 1528115200 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027122367101303915, + "loss": 2.8438, + "theoretical_loss": 3.5103255741253125, + "tokens_seen": 1528180736 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002712136409227683, + "loss": 2.8617, + "theoretical_loss": 3.5103123487774894, + "tokens_seen": 1528246272 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002712036108324975, + "loss": 2.7195, + "theoretical_loss": 3.5102991241555914, + "tokens_seen": 1528311808 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002711935807422267, + "loss": 2.7324, + "theoretical_loss": 3.5102859002595475, + "tokens_seen": 1528377344 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027118355065195587, + "loss": 2.78, + "theoretical_loss": 3.510272677089286, + "tokens_seen": 1528442880 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027117352056168505, + "loss": 2.8421, + "theoretical_loss": 3.510259454644737, + "tokens_seen": 1528508416 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2440329, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.820415496826172, + "objective/train/theoretical_loss": 3.5102462329258293, + "objective/train/tokens_used": 1549033952, + "theoretical_loss": 3.5102462329258293, + "tokens_seen": 1528573952 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027116349047141423, + "loss": 2.8852, + "theoretical_loss": 3.5102462329258293, + "tokens_seen": 1528573952 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002711534603811434, + "loss": 2.7439, + "theoretical_loss": 3.5102330119324914, + "tokens_seen": 1528639488 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027114343029087265, + "loss": 2.7347, + "theoretical_loss": 3.510219791664653, + "tokens_seen": 1528705024 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002711334002006018, + "loss": 3.0004, + "theoretical_loss": 3.510206572122243, + "tokens_seen": 1528770560 + }, + { + "epoch": 4.05, + "learning_rate": 0.000271123370110331, + "loss": 2.843, + "theoretical_loss": 3.5101933533051906, + "tokens_seen": 1528836096 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027111334002006014, + "loss": 2.815, + "theoretical_loss": 3.5101801352134245, + "tokens_seen": 1528901632 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002711033099297894, + "loss": 2.8124, + "theoretical_loss": 3.510166917846874, + "tokens_seen": 1528967168 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027109327983951856, + "loss": 2.8739, + "theoretical_loss": 3.5101537012054687, + "tokens_seen": 1529032704 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027108324974924774, + "loss": 2.8017, + "theoretical_loss": 3.5101404852891376, + "tokens_seen": 1529098240 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002710732196589769, + "loss": 2.8152, + "theoretical_loss": 3.510127270097809, + "tokens_seen": 1529163776 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002710631895687061, + "loss": 2.8625, + "theoretical_loss": 3.510114055631413, + "tokens_seen": 1529229312 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002710531594784353, + "loss": 2.827, + "theoretical_loss": 3.5101008418898787, + "tokens_seen": 1529294848 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002710431293881645, + "loss": 2.9273, + "theoretical_loss": 3.510087628873135, + "tokens_seen": 1529360384 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027103309929789364, + "loss": 2.8841, + "theoretical_loss": 3.5100744165811104, + "tokens_seen": 1529425920 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002710230692076229, + "loss": 2.7025, + "theoretical_loss": 3.5100612050137356, + "tokens_seen": 1529491456 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002710130391173521, + "loss": 2.7761, + "theoretical_loss": 3.5100479941709386, + "tokens_seen": 1529556992 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027100300902708124, + "loss": 2.7692, + "theoretical_loss": 3.5100347840526496, + "tokens_seen": 1529622528 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002709929789368105, + "loss": 2.9083, + "theoretical_loss": 3.5100215746587966, + "tokens_seen": 1529688064 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002709829488465396, + "loss": 2.8266, + "theoretical_loss": 3.51000836598931, + "tokens_seen": 1529753600 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027097291875626884, + "loss": 2.7785, + "theoretical_loss": 3.509995158044118, + "tokens_seen": 1529819136 + }, + { + "epoch": 4.05, + "learning_rate": 0.000270962888665998, + "loss": 2.7887, + "theoretical_loss": 3.509981950823151, + "tokens_seen": 1529884672 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002709528585757272, + "loss": 2.8066, + "theoretical_loss": 3.509968744326337, + "tokens_seen": 1529950208 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002709428284854564, + "loss": 2.91, + "theoretical_loss": 3.5099555385536068, + "tokens_seen": 1530015744 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027093279839518556, + "loss": 3.0106, + "theoretical_loss": 3.5099423335048883, + "tokens_seen": 1530081280 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027092276830491474, + "loss": 2.8018, + "theoretical_loss": 3.5099291291801116, + "tokens_seen": 1530146816 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2442464, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.70875883102417, + "objective/train/theoretical_loss": 3.5099159255792056, + "objective/train/tokens_used": 1550672352, + "theoretical_loss": 3.5099159255792056, + "tokens_seen": 1530212352 + }, + { + "epoch": 4.05, + "learning_rate": 0.000270912738214644, + "loss": 2.8796, + "theoretical_loss": 3.5099159255792056, + "tokens_seen": 1530212352 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002709027081243731, + "loss": 2.8187, + "theoretical_loss": 3.5099027227021002, + "tokens_seen": 1530277888 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027089267803410234, + "loss": 2.8254, + "theoretical_loss": 3.5098895205487235, + "tokens_seen": 1530343424 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027088264794383147, + "loss": 2.9814, + "theoretical_loss": 3.509876319119006, + "tokens_seen": 1530408960 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002708726178535607, + "loss": 2.8366, + "theoretical_loss": 3.509863118412877, + "tokens_seen": 1530474496 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002708625877632899, + "loss": 2.891, + "theoretical_loss": 3.509849918430265, + "tokens_seen": 1530540032 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027085255767301907, + "loss": 2.9494, + "theoretical_loss": 3.5098367191711004, + "tokens_seen": 1530605568 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027084252758274825, + "loss": 2.7471, + "theoretical_loss": 3.5098235206353117, + "tokens_seen": 1530671104 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002708324974924775, + "loss": 2.7704, + "theoretical_loss": 3.5098103228228292, + "tokens_seen": 1530736640 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002708224674022066, + "loss": 2.9247, + "theoretical_loss": 3.5097971257335816, + "tokens_seen": 1530802176 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027081243731193584, + "loss": 2.7857, + "theoretical_loss": 3.509783929367498, + "tokens_seen": 1530867712 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027080240722166497, + "loss": 2.6416, + "theoretical_loss": 3.5097707337245088, + "tokens_seen": 1530933248 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002707923771313942, + "loss": 2.7824, + "theoretical_loss": 3.5097575388045428, + "tokens_seen": 1530998784 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002707823470411234, + "loss": 2.8624, + "theoretical_loss": 3.5097443446075296, + "tokens_seen": 1531064320 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027077231695085257, + "loss": 2.9178, + "theoretical_loss": 3.509731151133398, + "tokens_seen": 1531129856 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027076228686058175, + "loss": 2.9405, + "theoretical_loss": 3.509717958382079, + "tokens_seen": 1531195392 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027075225677031093, + "loss": 2.882, + "theoretical_loss": 3.5097047663535004, + "tokens_seen": 1531260928 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002707422266800401, + "loss": 2.7654, + "theoretical_loss": 3.5096915750475928, + "tokens_seen": 1531326464 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027073219658976935, + "loss": 2.9431, + "theoretical_loss": 3.509678384464285, + "tokens_seen": 1531392000 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002707221664994985, + "loss": 2.8234, + "theoretical_loss": 3.5096651946035067, + "tokens_seen": 1531457536 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002707121364092277, + "loss": 2.9141, + "theoretical_loss": 3.5096520054651883, + "tokens_seen": 1531523072 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002707021063189569, + "loss": 2.7097, + "theoretical_loss": 3.5096388170492574, + "tokens_seen": 1531588608 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027069207622868607, + "loss": 2.6719, + "theoretical_loss": 3.5096256293556456, + "tokens_seen": 1531654144 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027068204613841525, + "loss": 2.8096, + "theoretical_loss": 3.509612442384281, + "tokens_seen": 1531719680 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027067201604814443, + "loss": 3.0574, + "theoretical_loss": 3.5095992561350933, + "tokens_seen": 1531785216 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2445369, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.928218364715576, + "objective/train/theoretical_loss": 3.5095860706080124, + "objective/train/tokens_used": 1552310752, + "theoretical_loss": 3.5095860706080124, + "tokens_seen": 1531850752 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002706619859578736, + "loss": 2.912, + "theoretical_loss": 3.5095860706080124, + "tokens_seen": 1531850752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027065195586760285, + "loss": 2.9059, + "theoretical_loss": 3.5095728858029682, + "tokens_seen": 1531916288 + }, + { + "epoch": 4.05, + "learning_rate": 0.000270641925777332, + "loss": 2.6433, + "theoretical_loss": 3.5095597017198905, + "tokens_seen": 1531981824 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002706318956870612, + "loss": 2.7965, + "theoretical_loss": 3.5095465183587073, + "tokens_seen": 1532047360 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027062186559679034, + "loss": 2.9266, + "theoretical_loss": 3.5095333357193494, + "tokens_seen": 1532112896 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002706118355065196, + "loss": 2.7567, + "theoretical_loss": 3.5095201538017466, + "tokens_seen": 1532178432 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027060180541624876, + "loss": 2.8574, + "theoretical_loss": 3.5095069726058274, + "tokens_seen": 1532243968 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027059177532597794, + "loss": 2.9706, + "theoretical_loss": 3.5094937921315226, + "tokens_seen": 1532309504 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002705817452357071, + "loss": 2.7622, + "theoretical_loss": 3.5094806123787614, + "tokens_seen": 1532375040 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002705717151454363, + "loss": 2.7685, + "theoretical_loss": 3.5094674333474734, + "tokens_seen": 1532440576 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002705616850551655, + "loss": 2.7968, + "theoretical_loss": 3.5094542550375882, + "tokens_seen": 1532506112 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002705516549648947, + "loss": 2.9059, + "theoretical_loss": 3.5094410774490354, + "tokens_seen": 1532571648 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027054162487462384, + "loss": 2.7045, + "theoretical_loss": 3.509427900581745, + "tokens_seen": 1532637184 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002705315947843531, + "loss": 2.8599, + "theoretical_loss": 3.509414724435646, + "tokens_seen": 1532702720 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027052156469408226, + "loss": 2.7558, + "theoretical_loss": 3.509401549010669, + "tokens_seen": 1532768256 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027051153460381144, + "loss": 2.8156, + "theoretical_loss": 3.5093883743067433, + "tokens_seen": 1532833792 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002705015045135406, + "loss": 2.8926, + "theoretical_loss": 3.5093752003237983, + "tokens_seen": 1532899328 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002704914744232698, + "loss": 2.9035, + "theoretical_loss": 3.5093620270617643, + "tokens_seen": 1532964864 + }, + { + "epoch": 4.05, + "learning_rate": 0.000270481444332999, + "loss": 2.6562, + "theoretical_loss": 3.509348854520571, + "tokens_seen": 1533030400 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002704714142427282, + "loss": 2.7245, + "theoretical_loss": 3.509335682700147, + "tokens_seen": 1533095936 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027046138415245735, + "loss": 2.7575, + "theoretical_loss": 3.5093225116004234, + "tokens_seen": 1533161472 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002704513540621866, + "loss": 2.813, + "theoretical_loss": 3.50930934122133, + "tokens_seen": 1533227008 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002704413239719157, + "loss": 2.8026, + "theoretical_loss": 3.5092961715627955, + "tokens_seen": 1533292544 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027043129388164494, + "loss": 2.8897, + "theoretical_loss": 3.5092830026247506, + "tokens_seen": 1533358080 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002704212637913741, + "loss": 2.715, + "theoretical_loss": 3.5092698344071245, + "tokens_seen": 1533423616 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2448031, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.090136766433716, + "objective/train/theoretical_loss": 3.5092566669098475, + "objective/train/tokens_used": 1553949152, + "theoretical_loss": 3.5092566669098475, + "tokens_seen": 1533489152 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002704112337011033, + "loss": 2.9002, + "theoretical_loss": 3.5092566669098475, + "tokens_seen": 1533489152 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002704012036108325, + "loss": 2.9469, + "theoretical_loss": 3.5092435001328486, + "tokens_seen": 1533554688 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027039117352056167, + "loss": 2.7824, + "theoretical_loss": 3.509230334076059, + "tokens_seen": 1533620224 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027038114343029085, + "loss": 2.818, + "theoretical_loss": 3.509217168739407, + "tokens_seen": 1533685760 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002703711133400201, + "loss": 2.9373, + "theoretical_loss": 3.5092040041228234, + "tokens_seen": 1533751296 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002703610832497492, + "loss": 2.9801, + "theoretical_loss": 3.509190840226238, + "tokens_seen": 1533816832 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027035105315947845, + "loss": 2.8316, + "theoretical_loss": 3.50917767704958, + "tokens_seen": 1533882368 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027034102306920763, + "loss": 2.9168, + "theoretical_loss": 3.5091645145927806, + "tokens_seen": 1533947904 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002703309929789368, + "loss": 2.9465, + "theoretical_loss": 3.5091513528557683, + "tokens_seen": 1534013440 + }, + { + "epoch": 4.05, + "learning_rate": 0.000270320962888666, + "loss": 2.7961, + "theoretical_loss": 3.5091381918384736, + "tokens_seen": 1534078976 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027031093279839517, + "loss": 2.8152, + "theoretical_loss": 3.509125031540826, + "tokens_seen": 1534144512 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027030090270812435, + "loss": 2.7827, + "theoretical_loss": 3.509111871962756, + "tokens_seen": 1534210048 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002702908726178536, + "loss": 2.9023, + "theoretical_loss": 3.5090987131041933, + "tokens_seen": 1534275584 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002702808425275827, + "loss": 2.9058, + "theoretical_loss": 3.5090855549650675, + "tokens_seen": 1534341120 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027027081243731195, + "loss": 2.7521, + "theoretical_loss": 3.5090723975453093, + "tokens_seen": 1534406656 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027026078234704113, + "loss": 2.8695, + "theoretical_loss": 3.5090592408448478, + "tokens_seen": 1534472192 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002702507522567703, + "loss": 2.9103, + "theoretical_loss": 3.5090460848636136, + "tokens_seen": 1534537728 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027024072216649955, + "loss": 2.6495, + "theoretical_loss": 3.5090329296015357, + "tokens_seen": 1534603264 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002702306920762287, + "loss": 2.8507, + "theoretical_loss": 3.5090197750585457, + "tokens_seen": 1534668800 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002702206619859579, + "loss": 2.8116, + "theoretical_loss": 3.5090066212345716, + "tokens_seen": 1534734336 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002702106318956871, + "loss": 2.7387, + "theoretical_loss": 3.508993468129545, + "tokens_seen": 1534799872 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027020060180541627, + "loss": 2.973, + "theoretical_loss": 3.5089803157433956, + "tokens_seen": 1534865408 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027019057171514545, + "loss": 2.6965, + "theoretical_loss": 3.5089671640760534, + "tokens_seen": 1534930944 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027018054162487463, + "loss": 2.7478, + "theoretical_loss": 3.5089540131274477, + "tokens_seen": 1534996480 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002701705115346038, + "loss": 2.794, + "theoretical_loss": 3.508940862897509, + "tokens_seen": 1535062016 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2449490, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.799323558807373, + "objective/train/theoretical_loss": 3.5089277133861674, + "objective/train/tokens_used": 1555587552, + "theoretical_loss": 3.5089277133861674, + "tokens_seen": 1535127552 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027016048144433305, + "loss": 2.8607, + "theoretical_loss": 3.5089277133861674, + "tokens_seen": 1535127552 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002701504513540622, + "loss": 2.8508, + "theoretical_loss": 3.5089145645933533, + "tokens_seen": 1535193088 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002701404212637914, + "loss": 2.7966, + "theoretical_loss": 3.508901416518996, + "tokens_seen": 1535258624 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027013039117352054, + "loss": 2.799, + "theoretical_loss": 3.5088882691630263, + "tokens_seen": 1535324160 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002701203610832498, + "loss": 2.7879, + "theoretical_loss": 3.5088751225253736, + "tokens_seen": 1535389696 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027011033099297896, + "loss": 2.8398, + "theoretical_loss": 3.508861976605969, + "tokens_seen": 1535455232 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027010030090270814, + "loss": 2.8512, + "theoretical_loss": 3.5088488314047415, + "tokens_seen": 1535520768 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002700902708124373, + "loss": 2.9236, + "theoretical_loss": 3.5088356869216217, + "tokens_seen": 1535586304 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002700802407221665, + "loss": 2.9193, + "theoretical_loss": 3.5088225431565396, + "tokens_seen": 1535651840 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002700702106318957, + "loss": 2.8843, + "theoretical_loss": 3.508809400109426, + "tokens_seen": 1535717376 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002700601805416249, + "loss": 2.949, + "theoretical_loss": 3.50879625778021, + "tokens_seen": 1535782912 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027005015045135404, + "loss": 2.8167, + "theoretical_loss": 3.5087831161688223, + "tokens_seen": 1535848448 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002700401203610833, + "loss": 2.8005, + "theoretical_loss": 3.5087699752751935, + "tokens_seen": 1535913984 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027003009027081246, + "loss": 2.7965, + "theoretical_loss": 3.508756835099253, + "tokens_seen": 1535979520 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027002006018054164, + "loss": 2.8653, + "theoretical_loss": 3.5087436956409315, + "tokens_seen": 1536045056 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002700100300902708, + "loss": 2.7572, + "theoretical_loss": 3.5087305569001583, + "tokens_seen": 1536110592 + }, + { + "epoch": 4.05, + "learning_rate": 0.00027, + "loss": 2.7268, + "theoretical_loss": 3.5087174188768646, + "tokens_seen": 1536176128 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002699899699097292, + "loss": 2.8026, + "theoretical_loss": 3.5087042815709806, + "tokens_seen": 1536241664 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002699799398194584, + "loss": 2.9605, + "theoretical_loss": 3.508691144982436, + "tokens_seen": 1536307200 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026996990972918755, + "loss": 2.7238, + "theoretical_loss": 3.5086780091111613, + "tokens_seen": 1536372736 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002699598796389168, + "loss": 2.7887, + "theoretical_loss": 3.5086648739570867, + "tokens_seen": 1536438272 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002699498495486459, + "loss": 2.9334, + "theoretical_loss": 3.5086517395201424, + "tokens_seen": 1536503808 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026993981945837514, + "loss": 2.7872, + "theoretical_loss": 3.5086386058002588, + "tokens_seen": 1536569344 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002699297893681043, + "loss": 2.8415, + "theoretical_loss": 3.508625472797366, + "tokens_seen": 1536634880 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002699197592778335, + "loss": 2.7233, + "theoretical_loss": 3.508612340511394, + "tokens_seen": 1536700416 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2452229, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8409132957458496, + "objective/train/theoretical_loss": 3.508599208942274, + "objective/train/tokens_used": 1557225952, + "theoretical_loss": 3.508599208942274, + "tokens_seen": 1536765952 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002699097291875627, + "loss": 2.7541, + "theoretical_loss": 3.508599208942274, + "tokens_seen": 1536765952 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026989969909729187, + "loss": 2.8158, + "theoretical_loss": 3.5085860780899356, + "tokens_seen": 1536831488 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026988966900702105, + "loss": 2.8072, + "theoretical_loss": 3.5085729479543093, + "tokens_seen": 1536897024 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002698796389167503, + "loss": 2.9336, + "theoretical_loss": 3.508559818535325, + "tokens_seen": 1536962560 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002698696088264794, + "loss": 2.9371, + "theoretical_loss": 3.508546689832914, + "tokens_seen": 1537028096 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026985957873620865, + "loss": 2.7373, + "theoretical_loss": 3.5085335618470057, + "tokens_seen": 1537093632 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026984954864593783, + "loss": 2.9485, + "theoretical_loss": 3.508520434577531, + "tokens_seen": 1537159168 + }, + { + "epoch": 4.05, + "learning_rate": 0.000269839518555667, + "loss": 2.9261, + "theoretical_loss": 3.50850730802442, + "tokens_seen": 1537224704 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002698294884653962, + "loss": 2.9539, + "theoretical_loss": 3.508494182187603, + "tokens_seen": 1537290240 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026981945837512537, + "loss": 2.8222, + "theoretical_loss": 3.5084810570670104, + "tokens_seen": 1537355776 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026980942828485455, + "loss": 2.5842, + "theoretical_loss": 3.508467932662573, + "tokens_seen": 1537421312 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002697993981945838, + "loss": 2.7616, + "theoretical_loss": 3.5084548089742205, + "tokens_seen": 1537486848 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002697893681043129, + "loss": 2.761, + "theoretical_loss": 3.508441686001884, + "tokens_seen": 1537552384 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026977933801404215, + "loss": 2.7138, + "theoretical_loss": 3.508428563745494, + "tokens_seen": 1537617920 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002697693079237713, + "loss": 2.6668, + "theoretical_loss": 3.5084154422049805, + "tokens_seen": 1537683456 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002697592778335005, + "loss": 2.8486, + "theoretical_loss": 3.5084023213802737, + "tokens_seen": 1537748992 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002697492477432297, + "loss": 2.6806, + "theoretical_loss": 3.508389201271304, + "tokens_seen": 1537814528 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002697392176529589, + "loss": 2.7905, + "theoretical_loss": 3.5083760818780023, + "tokens_seen": 1537880064 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026972918756268806, + "loss": 2.7932, + "theoretical_loss": 3.5083629632002995, + "tokens_seen": 1537945600 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002697191574724173, + "loss": 2.878, + "theoretical_loss": 3.5083498452381257, + "tokens_seen": 1538011136 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002697091273821464, + "loss": 2.8387, + "theoretical_loss": 3.50833672799141, + "tokens_seen": 1538076672 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026969909729187565, + "loss": 2.864, + "theoretical_loss": 3.508323611460085, + "tokens_seen": 1538142208 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002696890672016048, + "loss": 2.7723, + "theoretical_loss": 3.5083104956440803, + "tokens_seen": 1538207744 + }, + { + "epoch": 4.05, + "learning_rate": 0.000269679037111334, + "loss": 2.8017, + "theoretical_loss": 3.5082973805433264, + "tokens_seen": 1538273280 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002696690070210632, + "loss": 3.0398, + "theoretical_loss": 3.5082842661577542, + "tokens_seen": 1538338816 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2454703, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5318868160247803, + "objective/train/theoretical_loss": 3.5082711524872936, + "objective/train/tokens_used": 1558864352, + "theoretical_loss": 3.5082711524872936, + "tokens_seen": 1538404352 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002696589769307924, + "loss": 2.8561, + "theoretical_loss": 3.5082711524872936, + "tokens_seen": 1538404352 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026964894684052156, + "loss": 2.7952, + "theoretical_loss": 3.508258039531875, + "tokens_seen": 1538469888 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026963891675025074, + "loss": 2.8167, + "theoretical_loss": 3.50824492729143, + "tokens_seen": 1538535424 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002696288866599799, + "loss": 2.7331, + "theoretical_loss": 3.5082318157658885, + "tokens_seen": 1538600960 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026961885656970916, + "loss": 2.907, + "theoretical_loss": 3.5082187049551807, + "tokens_seen": 1538666496 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002696088264794383, + "loss": 2.8498, + "theoretical_loss": 3.508205594859238, + "tokens_seen": 1538732032 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002695987963891675, + "loss": 2.8687, + "theoretical_loss": 3.508192485477991, + "tokens_seen": 1538797568 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026958876629889665, + "loss": 2.7949, + "theoretical_loss": 3.5081793768113694, + "tokens_seen": 1538863104 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002695787362086259, + "loss": 2.8678, + "theoretical_loss": 3.5081662688593047, + "tokens_seen": 1538928640 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026956870611835506, + "loss": 2.8696, + "theoretical_loss": 3.5081531616217267, + "tokens_seen": 1538994176 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026955867602808424, + "loss": 2.9257, + "theoretical_loss": 3.5081400550985666, + "tokens_seen": 1539059712 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002695486459378134, + "loss": 2.8809, + "theoretical_loss": 3.508126949289755, + "tokens_seen": 1539125248 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026953861584754266, + "loss": 2.7539, + "theoretical_loss": 3.5081138441952224, + "tokens_seen": 1539190784 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002695285857572718, + "loss": 2.8716, + "theoretical_loss": 3.5081007398149, + "tokens_seen": 1539256320 + }, + { + "epoch": 4.05, + "learning_rate": 0.000269518555667001, + "loss": 2.9172, + "theoretical_loss": 3.5080876361487174, + "tokens_seen": 1539321856 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002695085255767302, + "loss": 2.9192, + "theoretical_loss": 3.508074533196606, + "tokens_seen": 1539387392 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002694984954864594, + "loss": 2.9092, + "theoretical_loss": 3.5080614309584965, + "tokens_seen": 1539452928 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002694884653961886, + "loss": 2.7289, + "theoretical_loss": 3.5080483294343194, + "tokens_seen": 1539518464 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026947843530591775, + "loss": 3.0006, + "theoretical_loss": 3.5080352286240055, + "tokens_seen": 1539584000 + }, + { + "epoch": 4.05, + "learning_rate": 0.000269468405215647, + "loss": 2.691, + "theoretical_loss": 3.508022128527486, + "tokens_seen": 1539649536 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002694583751253761, + "loss": 2.765, + "theoretical_loss": 3.5080090291446906, + "tokens_seen": 1539715072 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026944834503510534, + "loss": 2.7182, + "theoretical_loss": 3.507995930475551, + "tokens_seen": 1539780608 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002694383149448345, + "loss": 2.7646, + "theoretical_loss": 3.507982832519997, + "tokens_seen": 1539846144 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002694282848545637, + "loss": 2.8371, + "theoretical_loss": 3.5079697352779604, + "tokens_seen": 1539911680 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002694182547642929, + "loss": 2.5467, + "theoretical_loss": 3.507956638749371, + "tokens_seen": 1539977216 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2457638, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.800381660461426, + "objective/train/theoretical_loss": 3.50794354293416, + "objective/train/tokens_used": 1560502752, + "theoretical_loss": 3.50794354293416, + "tokens_seen": 1540042752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026940822467402207, + "loss": 2.7507, + "theoretical_loss": 3.50794354293416, + "tokens_seen": 1540042752 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026939819458375125, + "loss": 2.886, + "theoretical_loss": 3.5079304478322584, + "tokens_seen": 1540108288 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002693881644934805, + "loss": 2.7581, + "theoretical_loss": 3.507917353443597, + "tokens_seen": 1540173824 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002693781344032096, + "loss": 2.7662, + "theoretical_loss": 3.5079042597681065, + "tokens_seen": 1540239360 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026936810431293885, + "loss": 2.8778, + "theoretical_loss": 3.507891166805717, + "tokens_seen": 1540304896 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026935807422266803, + "loss": 2.8218, + "theoretical_loss": 3.5078780745563605, + "tokens_seen": 1540370432 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002693480441323972, + "loss": 2.9843, + "theoretical_loss": 3.5078649830199673, + "tokens_seen": 1540435968 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002693380140421264, + "loss": 2.9074, + "theoretical_loss": 3.507851892196468, + "tokens_seen": 1540501504 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026932798395185557, + "loss": 2.8728, + "theoretical_loss": 3.507838802085794, + "tokens_seen": 1540567040 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026931795386158475, + "loss": 2.8873, + "theoretical_loss": 3.5078257126878754, + "tokens_seen": 1540632576 + }, + { + "epoch": 4.05, + "learning_rate": 0.000269307923771314, + "loss": 2.7861, + "theoretical_loss": 3.5078126240026437, + "tokens_seen": 1540698112 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002692978936810431, + "loss": 2.6614, + "theoretical_loss": 3.5077995360300296, + "tokens_seen": 1540763648 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026928786359077235, + "loss": 2.8811, + "theoretical_loss": 3.5077864487699637, + "tokens_seen": 1540829184 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002692778335005015, + "loss": 2.7649, + "theoretical_loss": 3.5077733622223777, + "tokens_seen": 1540894720 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002692678034102307, + "loss": 2.8556, + "theoretical_loss": 3.5077602763872022, + "tokens_seen": 1540960256 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002692577733199599, + "loss": 2.8518, + "theoretical_loss": 3.507747191264367, + "tokens_seen": 1541025792 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002692477432296891, + "loss": 2.8301, + "theoretical_loss": 3.507734106853805, + "tokens_seen": 1541091328 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026923771313941826, + "loss": 2.8807, + "theoretical_loss": 3.5077210231554456, + "tokens_seen": 1541156864 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002692276830491475, + "loss": 2.7623, + "theoretical_loss": 3.5077079401692206, + "tokens_seen": 1541222400 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002692176529588766, + "loss": 2.744, + "theoretical_loss": 3.50769485789506, + "tokens_seen": 1541287936 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026920762286860585, + "loss": 2.7583, + "theoretical_loss": 3.507681776332896, + "tokens_seen": 1541353472 + }, + { + "epoch": 4.05, + "learning_rate": 0.000269197592778335, + "loss": 2.826, + "theoretical_loss": 3.507668695482659, + "tokens_seen": 1541419008 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002691875626880642, + "loss": 2.7697, + "theoretical_loss": 3.5076556153442793, + "tokens_seen": 1541484544 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002691775325977934, + "loss": 2.741, + "theoretical_loss": 3.507642535917689, + "tokens_seen": 1541550080 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002691675025075226, + "loss": 2.8699, + "theoretical_loss": 3.5076294572028184, + "tokens_seen": 1541615616 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2460385, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8386573791503906, + "objective/train/theoretical_loss": 3.5076163791995993, + "objective/train/tokens_used": 1562141152, + "theoretical_loss": 3.5076163791995993, + "tokens_seen": 1541681152 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026915747241725176, + "loss": 2.8687, + "theoretical_loss": 3.5076163791995993, + "tokens_seen": 1541681152 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026914744232698094, + "loss": 2.8745, + "theoretical_loss": 3.507603301907962, + "tokens_seen": 1541746688 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002691374122367101, + "loss": 2.734, + "theoretical_loss": 3.5075902253278377, + "tokens_seen": 1541812224 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026912738214643936, + "loss": 2.8304, + "theoretical_loss": 3.5075771494591574, + "tokens_seen": 1541877760 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002691173520561685, + "loss": 2.7157, + "theoretical_loss": 3.507564074301852, + "tokens_seen": 1541943296 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002691073219658977, + "loss": 2.6845, + "theoretical_loss": 3.5075509998558534, + "tokens_seen": 1542008832 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026909729187562685, + "loss": 2.8011, + "theoretical_loss": 3.507537926121092, + "tokens_seen": 1542074368 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002690872617853561, + "loss": 2.8731, + "theoretical_loss": 3.5075248530974985, + "tokens_seen": 1542139904 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026907723169508526, + "loss": 2.768, + "theoretical_loss": 3.507511780785005, + "tokens_seen": 1542205440 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026906720160481444, + "loss": 2.8533, + "theoretical_loss": 3.507498709183542, + "tokens_seen": 1542270976 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002690571715145436, + "loss": 2.7546, + "theoretical_loss": 3.5074856382930406, + "tokens_seen": 1542336512 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026904714142427286, + "loss": 2.8237, + "theoretical_loss": 3.5074725681134318, + "tokens_seen": 1542402048 + }, + { + "epoch": 4.05, + "learning_rate": 0.000269037111334002, + "loss": 2.7342, + "theoretical_loss": 3.5074594986446472, + "tokens_seen": 1542467584 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002690270812437312, + "loss": 2.8805, + "theoretical_loss": 3.5074464298866177, + "tokens_seen": 1542533120 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026901705115346035, + "loss": 2.7583, + "theoretical_loss": 3.507433361839275, + "tokens_seen": 1542598656 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002690070210631896, + "loss": 2.7993, + "theoretical_loss": 3.5074202945025483, + "tokens_seen": 1542664192 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026899699097291877, + "loss": 2.9206, + "theoretical_loss": 3.5074072278763713, + "tokens_seen": 1542729728 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026898696088264795, + "loss": 2.775, + "theoretical_loss": 3.507394161960674, + "tokens_seen": 1542795264 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026897693079237713, + "loss": 2.8425, + "theoretical_loss": 3.5073810967553873, + "tokens_seen": 1542860800 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002689669007021063, + "loss": 2.7451, + "theoretical_loss": 3.507368032260443, + "tokens_seen": 1542926336 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002689568706118355, + "loss": 2.9784, + "theoretical_loss": 3.5073549684757714, + "tokens_seen": 1542991872 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002689468405215647, + "loss": 2.8845, + "theoretical_loss": 3.507341905401305, + "tokens_seen": 1543057408 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026893681043129385, + "loss": 2.7674, + "theoretical_loss": 3.5073288430369747, + "tokens_seen": 1543122944 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002689267803410231, + "loss": 2.8103, + "theoretical_loss": 3.5073157813827107, + "tokens_seen": 1543188480 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002689167502507522, + "loss": 2.7684, + "theoretical_loss": 3.507302720438445, + "tokens_seen": 1543254016 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2463209, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9133267402648926, + "objective/train/theoretical_loss": 3.5072896602041093, + "objective/train/tokens_used": 1563779552, + "theoretical_loss": 3.5072896602041093, + "tokens_seen": 1543319552 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026890672016048145, + "loss": 2.7628, + "theoretical_loss": 3.5072896602041093, + "tokens_seen": 1543319552 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026889669007021063, + "loss": 2.8506, + "theoretical_loss": 3.5072766006796345, + "tokens_seen": 1543385088 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002688866599799398, + "loss": 2.8027, + "theoretical_loss": 3.507263541864952, + "tokens_seen": 1543450624 + }, + { + "epoch": 4.05, + "learning_rate": 0.000268876629889669, + "loss": 2.7403, + "theoretical_loss": 3.507250483759992, + "tokens_seen": 1543516160 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026886659979939823, + "loss": 2.8383, + "theoretical_loss": 3.5072374263646875, + "tokens_seen": 1543581696 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026885656970912736, + "loss": 2.7179, + "theoretical_loss": 3.5072243696789687, + "tokens_seen": 1543647232 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002688465396188566, + "loss": 2.5985, + "theoretical_loss": 3.507211313702767, + "tokens_seen": 1543712768 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002688365095285857, + "loss": 2.7988, + "theoretical_loss": 3.5071982584360146, + "tokens_seen": 1543778304 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026882647943831495, + "loss": 2.6967, + "theoretical_loss": 3.5071852038786417, + "tokens_seen": 1543843840 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026881644934804413, + "loss": 2.8021, + "theoretical_loss": 3.5071721500305797, + "tokens_seen": 1543909376 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002688064192577733, + "loss": 2.7118, + "theoretical_loss": 3.507159096891761, + "tokens_seen": 1543974912 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002687963891675025, + "loss": 2.7397, + "theoretical_loss": 3.5071460444621163, + "tokens_seen": 1544040448 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002687863590772317, + "loss": 2.5713, + "theoretical_loss": 3.5071329927415764, + "tokens_seen": 1544105984 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026877632898696086, + "loss": 2.8656, + "theoretical_loss": 3.507119941730074, + "tokens_seen": 1544171520 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002687662988966901, + "loss": 2.9072, + "theoretical_loss": 3.507106891427539, + "tokens_seen": 1544237056 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002687562688064193, + "loss": 2.8106, + "theoretical_loss": 3.507093841833904, + "tokens_seen": 1544302592 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026874623871614846, + "loss": 2.787, + "theoretical_loss": 3.5070807929491004, + "tokens_seen": 1544368128 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002687362086258777, + "loss": 2.8349, + "theoretical_loss": 3.5070677447730585, + "tokens_seen": 1544433664 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002687261785356068, + "loss": 2.8191, + "theoretical_loss": 3.507054697305711, + "tokens_seen": 1544499200 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026871614844533605, + "loss": 2.8467, + "theoretical_loss": 3.5070416505469884, + "tokens_seen": 1544564736 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002687061183550652, + "loss": 2.8863, + "theoretical_loss": 3.5070286044968224, + "tokens_seen": 1544630272 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002686960882647944, + "loss": 2.9367, + "theoretical_loss": 3.5070155591551453, + "tokens_seen": 1544695808 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002686860581745236, + "loss": 2.8936, + "theoretical_loss": 3.5070025145218873, + "tokens_seen": 1544761344 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002686760280842528, + "loss": 2.6989, + "theoretical_loss": 3.50698947059698, + "tokens_seen": 1544826880 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026866599799398196, + "loss": 2.8046, + "theoretical_loss": 3.506976427380356, + "tokens_seen": 1544892416 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2466092, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.001208782196045, + "objective/train/theoretical_loss": 3.5069633848719457, + "objective/train/tokens_used": 1565417952, + "theoretical_loss": 3.5069633848719457, + "tokens_seen": 1544957952 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026865596790371114, + "loss": 2.8525, + "theoretical_loss": 3.5069633848719457, + "tokens_seen": 1544957952 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002686459378134403, + "loss": 2.7435, + "theoretical_loss": 3.5069503430716815, + "tokens_seen": 1545023488 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026863590772316956, + "loss": 2.7826, + "theoretical_loss": 3.506937301979494, + "tokens_seen": 1545089024 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002686258776328987, + "loss": 2.9616, + "theoretical_loss": 3.5069242615953153, + "tokens_seen": 1545154560 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002686158475426279, + "loss": 2.868, + "theoretical_loss": 3.506911221919077, + "tokens_seen": 1545220096 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026860581745235705, + "loss": 2.7315, + "theoretical_loss": 3.50689818295071, + "tokens_seen": 1545285632 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002685957873620863, + "loss": 2.6596, + "theoretical_loss": 3.5068851446901466, + "tokens_seen": 1545351168 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026858575727181546, + "loss": 2.7553, + "theoretical_loss": 3.506872107137318, + "tokens_seen": 1545416704 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026857572718154464, + "loss": 2.7059, + "theoretical_loss": 3.506859070292156, + "tokens_seen": 1545482240 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002685656970912738, + "loss": 2.7334, + "theoretical_loss": 3.506846034154592, + "tokens_seen": 1545547776 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026855566700100306, + "loss": 2.824, + "theoretical_loss": 3.5068329987245574, + "tokens_seen": 1545613312 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002685456369107322, + "loss": 2.8928, + "theoretical_loss": 3.506819964001984, + "tokens_seen": 1545678848 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002685356068204614, + "loss": 2.8519, + "theoretical_loss": 3.506806929986803, + "tokens_seen": 1545744384 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026852557673019055, + "loss": 2.7681, + "theoretical_loss": 3.506793896678947, + "tokens_seen": 1545809920 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002685155466399198, + "loss": 2.8143, + "theoretical_loss": 3.5067808640783475, + "tokens_seen": 1545875456 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026850551654964897, + "loss": 2.6825, + "theoretical_loss": 3.506767832184935, + "tokens_seen": 1545940992 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026849548645937815, + "loss": 2.8626, + "theoretical_loss": 3.506754800998642, + "tokens_seen": 1546006528 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026848545636910733, + "loss": 2.7618, + "theoretical_loss": 3.5067417705194, + "tokens_seen": 1546072064 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002684754262788365, + "loss": 2.7101, + "theoretical_loss": 3.506728740747141, + "tokens_seen": 1546137600 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002684653961885657, + "loss": 2.6256, + "theoretical_loss": 3.506715711681796, + "tokens_seen": 1546203136 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002684553660982949, + "loss": 2.8859, + "theoretical_loss": 3.506702683323297, + "tokens_seen": 1546268672 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026844533600802405, + "loss": 2.935, + "theoretical_loss": 3.5066896556715754, + "tokens_seen": 1546334208 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002684353059177533, + "loss": 2.79, + "theoretical_loss": 3.506676628726564, + "tokens_seen": 1546399744 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002684252758274824, + "loss": 2.9048, + "theoretical_loss": 3.506663602488193, + "tokens_seen": 1546465280 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026841524573721165, + "loss": 2.8768, + "theoretical_loss": 3.506650576956396, + "tokens_seen": 1546530816 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2467587, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7923061847686768, + "objective/train/theoretical_loss": 3.5066375521311026, + "objective/train/tokens_used": 1567056352, + "theoretical_loss": 3.5066375521311026, + "tokens_seen": 1546596352 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026840521564694083, + "loss": 2.7306, + "theoretical_loss": 3.5066375521311026, + "tokens_seen": 1546596352 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026839518555667, + "loss": 2.841, + "theoretical_loss": 3.506624528012246, + "tokens_seen": 1546661888 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002683851554663992, + "loss": 2.76, + "theoretical_loss": 3.5066115045997575, + "tokens_seen": 1546727424 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026837512537612843, + "loss": 2.9558, + "theoretical_loss": 3.506598481893569, + "tokens_seen": 1546792960 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026836509528585756, + "loss": 2.7083, + "theoretical_loss": 3.506585459893612, + "tokens_seen": 1546858496 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002683550651955868, + "loss": 2.8004, + "theoretical_loss": 3.506572438599818, + "tokens_seen": 1546924032 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002683450351053159, + "loss": 2.9146, + "theoretical_loss": 3.5065594180121202, + "tokens_seen": 1546989568 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026833500501504515, + "loss": 2.6223, + "theoretical_loss": 3.5065463981304488, + "tokens_seen": 1547055104 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026832497492477433, + "loss": 2.7825, + "theoretical_loss": 3.506533378954736, + "tokens_seen": 1547120640 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002683149448345035, + "loss": 2.8339, + "theoretical_loss": 3.506520360484914, + "tokens_seen": 1547186176 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002683049147442327, + "loss": 2.8749, + "theoretical_loss": 3.506507342720915, + "tokens_seen": 1547251712 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002682948846539619, + "loss": 2.706, + "theoretical_loss": 3.50649432566267, + "tokens_seen": 1547317248 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026828485456369106, + "loss": 2.8591, + "theoretical_loss": 3.506481309310111, + "tokens_seen": 1547382784 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002682748244734203, + "loss": 2.6603, + "theoretical_loss": 3.50646829366317, + "tokens_seen": 1547448320 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002682647943831494, + "loss": 2.8457, + "theoretical_loss": 3.5064552787217798, + "tokens_seen": 1547513856 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026825476429287866, + "loss": 2.8378, + "theoretical_loss": 3.5064422644858704, + "tokens_seen": 1547579392 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002682447342026078, + "loss": 2.8368, + "theoretical_loss": 3.506429250955375, + "tokens_seen": 1547644928 + }, + { + "epoch": 4.05, + "learning_rate": 0.000268234704112337, + "loss": 2.8827, + "theoretical_loss": 3.506416238130225, + "tokens_seen": 1547710464 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002682246740220662, + "loss": 2.9672, + "theoretical_loss": 3.5064032260103524, + "tokens_seen": 1547776000 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002682146439317954, + "loss": 2.919, + "theoretical_loss": 3.50639021459569, + "tokens_seen": 1547841536 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026820461384152456, + "loss": 2.7583, + "theoretical_loss": 3.506377203886168, + "tokens_seen": 1547907072 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002681945837512538, + "loss": 2.9136, + "theoretical_loss": 3.506364193881719, + "tokens_seen": 1547972608 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002681845536609829, + "loss": 2.8714, + "theoretical_loss": 3.506351184582276, + "tokens_seen": 1548038144 + }, + { + "epoch": 4.05, + "learning_rate": 0.00026817452357071216, + "loss": 2.7796, + "theoretical_loss": 3.5063381759877696, + "tokens_seen": 1548103680 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002681644934804413, + "loss": 2.6165, + "theoretical_loss": 3.5063251680981327, + "tokens_seen": 1548169216 + }, + { + "epoch": 4.05, + "objective/train/docs_used": 2471324, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5822272300720215, + "objective/train/theoretical_loss": 3.5063121609132963, + "objective/train/tokens_used": 1568694752, + "theoretical_loss": 3.5063121609132963, + "tokens_seen": 1548234752 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002681544633901705, + "loss": 2.6237, + "theoretical_loss": 3.5063121609132963, + "tokens_seen": 1548234752 + }, + { + "epoch": 4.05, + "learning_rate": 0.0002681444332998997, + "loss": 2.8122, + "theoretical_loss": 3.5062991544331936, + "tokens_seen": 1548300288 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002681344032096289, + "loss": 2.7684, + "theoretical_loss": 3.5062861486577557, + "tokens_seen": 1548365824 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026812437311935807, + "loss": 2.7802, + "theoretical_loss": 3.5062731435869146, + "tokens_seen": 1548431360 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026811434302908725, + "loss": 2.6823, + "theoretical_loss": 3.5062601392206028, + "tokens_seen": 1548496896 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026810431293881643, + "loss": 2.8979, + "theoretical_loss": 3.506247135558752, + "tokens_seen": 1548562432 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026809428284854566, + "loss": 2.8804, + "theoretical_loss": 3.5062341326012945, + "tokens_seen": 1548627968 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002680842527582748, + "loss": 2.8869, + "theoretical_loss": 3.5062211303481616, + "tokens_seen": 1548693504 + }, + { + "epoch": 4.06, + "learning_rate": 0.000268074222668004, + "loss": 2.8831, + "theoretical_loss": 3.506208128799286, + "tokens_seen": 1548759040 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026806419257773315, + "loss": 2.8184, + "theoretical_loss": 3.5061951279545998, + "tokens_seen": 1548824576 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002680541624874624, + "loss": 2.8735, + "theoretical_loss": 3.506182127814035, + "tokens_seen": 1548890112 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026804413239719157, + "loss": 2.8772, + "theoretical_loss": 3.5061691283775236, + "tokens_seen": 1548955648 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026803410230692075, + "loss": 2.8047, + "theoretical_loss": 3.5061561296449972, + "tokens_seen": 1549021184 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026802407221664993, + "loss": 2.7574, + "theoretical_loss": 3.5061431316163887, + "tokens_seen": 1549086720 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026801404212637917, + "loss": 2.7813, + "theoretical_loss": 3.50613013429163, + "tokens_seen": 1549152256 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026800401203610835, + "loss": 2.7512, + "theoretical_loss": 3.506117137670653, + "tokens_seen": 1549217792 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026799398194583753, + "loss": 2.7639, + "theoretical_loss": 3.506104141753389, + "tokens_seen": 1549283328 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002679839518555667, + "loss": 2.8592, + "theoretical_loss": 3.506091146539772, + "tokens_seen": 1549348864 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002679739217652959, + "loss": 2.9203, + "theoretical_loss": 3.5060781520297333, + "tokens_seen": 1549414400 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002679638916750251, + "loss": 2.8497, + "theoretical_loss": 3.5060651582232047, + "tokens_seen": 1549479936 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026795386158475425, + "loss": 2.8395, + "theoretical_loss": 3.506052165120118, + "tokens_seen": 1549545472 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002679438314944835, + "loss": 2.8532, + "theoretical_loss": 3.5060391727204063, + "tokens_seen": 1549611008 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002679338014042126, + "loss": 2.7856, + "theoretical_loss": 3.5060261810240014, + "tokens_seen": 1549676544 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026792377131394185, + "loss": 2.8613, + "theoretical_loss": 3.506013190030836, + "tokens_seen": 1549742080 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026791374122367103, + "loss": 2.7111, + "theoretical_loss": 3.506000199740841, + "tokens_seen": 1549807616 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2474176, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8033578395843506, + "objective/train/theoretical_loss": 3.5059872101539495, + "objective/train/tokens_used": 1570333152, + "theoretical_loss": 3.5059872101539495, + "tokens_seen": 1549873152 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002679037111334002, + "loss": 2.8828, + "theoretical_loss": 3.5059872101539495, + "tokens_seen": 1549873152 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002678936810431294, + "loss": 2.8513, + "theoretical_loss": 3.505974221270094, + "tokens_seen": 1549938688 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026788365095285863, + "loss": 2.804, + "theoretical_loss": 3.5059612330892067, + "tokens_seen": 1550004224 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026787362086258776, + "loss": 2.8358, + "theoretical_loss": 3.5059482456112185, + "tokens_seen": 1550069760 + }, + { + "epoch": 4.06, + "learning_rate": 0.000267863590772317, + "loss": 2.7617, + "theoretical_loss": 3.5059352588360637, + "tokens_seen": 1550135296 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002678535606820461, + "loss": 2.8791, + "theoretical_loss": 3.5059222727636725, + "tokens_seen": 1550200832 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026784353059177535, + "loss": 2.8052, + "theoretical_loss": 3.505909287393979, + "tokens_seen": 1550266368 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026783350050150453, + "loss": 2.7734, + "theoretical_loss": 3.5058963027269137, + "tokens_seen": 1550331904 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002678234704112337, + "loss": 2.7827, + "theoretical_loss": 3.5058833187624105, + "tokens_seen": 1550397440 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002678134403209629, + "loss": 2.7006, + "theoretical_loss": 3.5058703355004006, + "tokens_seen": 1550462976 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002678034102306921, + "loss": 2.7879, + "theoretical_loss": 3.505857352940817, + "tokens_seen": 1550528512 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026779338014042126, + "loss": 2.8122, + "theoretical_loss": 3.5058443710835916, + "tokens_seen": 1550594048 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002677833500501505, + "loss": 2.7948, + "theoretical_loss": 3.505831389928657, + "tokens_seen": 1550659584 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002677733199598796, + "loss": 2.8378, + "theoretical_loss": 3.5058184094759453, + "tokens_seen": 1550725120 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026776328986960886, + "loss": 2.8741, + "theoretical_loss": 3.5058054297253882, + "tokens_seen": 1550790656 + }, + { + "epoch": 4.06, + "learning_rate": 0.000267753259779338, + "loss": 2.7501, + "theoretical_loss": 3.5057924506769194, + "tokens_seen": 1550856192 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002677432296890672, + "loss": 2.8834, + "theoretical_loss": 3.5057794723304703, + "tokens_seen": 1550921728 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002677331995987964, + "loss": 2.8218, + "theoretical_loss": 3.505766494685974, + "tokens_seen": 1550987264 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002677231695085256, + "loss": 2.7664, + "theoretical_loss": 3.5057535177433623, + "tokens_seen": 1551052800 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026771313941825476, + "loss": 2.8801, + "theoretical_loss": 3.5057405415025675, + "tokens_seen": 1551118336 + }, + { + "epoch": 4.06, + "learning_rate": 0.000267703109327984, + "loss": 2.9702, + "theoretical_loss": 3.505727565963522, + "tokens_seen": 1551183872 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002676930792377131, + "loss": 2.9023, + "theoretical_loss": 3.505714591126159, + "tokens_seen": 1551249408 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026768304914744236, + "loss": 2.823, + "theoretical_loss": 3.50570161699041, + "tokens_seen": 1551314944 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002676730190571715, + "loss": 2.7598, + "theoretical_loss": 3.5056886435562076, + "tokens_seen": 1551380480 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002676629889669007, + "loss": 2.7639, + "theoretical_loss": 3.505675670823485, + "tokens_seen": 1551446016 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2475586, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.711008310317993, + "objective/train/theoretical_loss": 3.5056626987921735, + "objective/train/tokens_used": 1571971552, + "theoretical_loss": 3.5056626987921735, + "tokens_seen": 1551511552 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002676529588766299, + "loss": 2.8031, + "theoretical_loss": 3.5056626987921735, + "tokens_seen": 1551511552 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002676429287863591, + "loss": 2.727, + "theoretical_loss": 3.505649727462206, + "tokens_seen": 1551577088 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026763289869608827, + "loss": 2.8876, + "theoretical_loss": 3.505636756833515, + "tokens_seen": 1551642624 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026762286860581745, + "loss": 2.7757, + "theoretical_loss": 3.5056237869060336, + "tokens_seen": 1551708160 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026761283851554663, + "loss": 2.8952, + "theoretical_loss": 3.505610817679693, + "tokens_seen": 1551773696 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026760280842527586, + "loss": 2.8053, + "theoretical_loss": 3.5055978491544266, + "tokens_seen": 1551839232 + }, + { + "epoch": 4.06, + "learning_rate": 0.000267592778335005, + "loss": 2.7774, + "theoretical_loss": 3.505584881330167, + "tokens_seen": 1551904768 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002675827482447342, + "loss": 2.7643, + "theoretical_loss": 3.505571914206846, + "tokens_seen": 1551970304 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026757271815446335, + "loss": 2.7837, + "theoretical_loss": 3.505558947784397, + "tokens_seen": 1552035840 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002675626880641926, + "loss": 2.788, + "theoretical_loss": 3.5055459820627517, + "tokens_seen": 1552101376 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026755265797392177, + "loss": 2.7906, + "theoretical_loss": 3.505533017041843, + "tokens_seen": 1552166912 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026754262788365095, + "loss": 2.6598, + "theoretical_loss": 3.505520052721603, + "tokens_seen": 1552232448 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026753259779338013, + "loss": 2.8811, + "theoretical_loss": 3.505507089101965, + "tokens_seen": 1552297984 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026752256770310937, + "loss": 2.8072, + "theoretical_loss": 3.505494126182861, + "tokens_seen": 1552363520 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002675125376128385, + "loss": 2.78, + "theoretical_loss": 3.505481163964224, + "tokens_seen": 1552429056 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026750250752256773, + "loss": 2.8672, + "theoretical_loss": 3.5054682024459867, + "tokens_seen": 1552494592 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026749247743229686, + "loss": 2.6471, + "theoretical_loss": 3.5054552416280806, + "tokens_seen": 1552560128 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002674824473420261, + "loss": 2.8428, + "theoretical_loss": 3.50544228151044, + "tokens_seen": 1552625664 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026747241725175527, + "loss": 2.818, + "theoretical_loss": 3.5054293220929957, + "tokens_seen": 1552691200 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026746238716148445, + "loss": 2.8744, + "theoretical_loss": 3.505416363375682, + "tokens_seen": 1552756736 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026745235707121363, + "loss": 2.7031, + "theoretical_loss": 3.5054034053584298, + "tokens_seen": 1552822272 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002674423269809428, + "loss": 2.7383, + "theoretical_loss": 3.5053904480411733, + "tokens_seen": 1552887808 + }, + { + "epoch": 4.06, + "learning_rate": 0.000267432296890672, + "loss": 2.9078, + "theoretical_loss": 3.505377491423844, + "tokens_seen": 1552953344 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026742226680040123, + "loss": 2.7138, + "theoretical_loss": 3.5053645355063754, + "tokens_seen": 1553018880 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026741223671013036, + "loss": 2.6776, + "theoretical_loss": 3.5053515802887, + "tokens_seen": 1553084416 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2478252, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.023747682571411, + "objective/train/theoretical_loss": 3.50533862577075, + "objective/train/tokens_used": 1573609952, + "theoretical_loss": 3.50533862577075, + "tokens_seen": 1553149952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002674022066198596, + "loss": 2.905, + "theoretical_loss": 3.50533862577075, + "tokens_seen": 1553149952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002673921765295888, + "loss": 2.8004, + "theoretical_loss": 3.5053256719524586, + "tokens_seen": 1553215488 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026738214643931796, + "loss": 2.8537, + "theoretical_loss": 3.5053127188337583, + "tokens_seen": 1553281024 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026737211634904714, + "loss": 2.7875, + "theoretical_loss": 3.5052997664145815, + "tokens_seen": 1553346560 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002673620862587763, + "loss": 2.9104, + "theoretical_loss": 3.5052868146948613, + "tokens_seen": 1553412096 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002673520561685055, + "loss": 2.8377, + "theoretical_loss": 3.50527386367453, + "tokens_seen": 1553477632 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026734202607823474, + "loss": 2.8266, + "theoretical_loss": 3.5052609133535215, + "tokens_seen": 1553543168 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026733199598796386, + "loss": 2.783, + "theoretical_loss": 3.505247963731767, + "tokens_seen": 1553608704 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002673219658976931, + "loss": 2.6858, + "theoretical_loss": 3.5052350148092, + "tokens_seen": 1553674240 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002673119358074222, + "loss": 2.743, + "theoretical_loss": 3.5052220665857536, + "tokens_seen": 1553739776 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026730190571715146, + "loss": 2.7721, + "theoretical_loss": 3.5052091190613597, + "tokens_seen": 1553805312 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026729187562688064, + "loss": 2.7855, + "theoretical_loss": 3.5051961722359515, + "tokens_seen": 1553870848 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002672818455366098, + "loss": 2.6493, + "theoretical_loss": 3.505183226109462, + "tokens_seen": 1553936384 + }, + { + "epoch": 4.06, + "learning_rate": 0.000267271815446339, + "loss": 2.7262, + "theoretical_loss": 3.505170280681824, + "tokens_seen": 1554001920 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002672617853560682, + "loss": 2.7852, + "theoretical_loss": 3.50515733595297, + "tokens_seen": 1554067456 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002672517552657974, + "loss": 2.848, + "theoretical_loss": 3.5051443919228324, + "tokens_seen": 1554132992 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002672417251755266, + "loss": 2.932, + "theoretical_loss": 3.5051314485913454, + "tokens_seen": 1554198528 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002672316950852558, + "loss": 2.6794, + "theoretical_loss": 3.505118505958441, + "tokens_seen": 1554264064 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026722166499498496, + "loss": 2.7767, + "theoretical_loss": 3.505105564024051, + "tokens_seen": 1554329600 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002672116349047142, + "loss": 2.765, + "theoretical_loss": 3.5050926227881103, + "tokens_seen": 1554395136 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002672016048144433, + "loss": 2.8238, + "theoretical_loss": 3.5050796822505506, + "tokens_seen": 1554460672 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026719157472417256, + "loss": 2.824, + "theoretical_loss": 3.5050667424113042, + "tokens_seen": 1554526208 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002671815446339017, + "loss": 2.7956, + "theoretical_loss": 3.5050538032703056, + "tokens_seen": 1554591744 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002671715145436309, + "loss": 2.9282, + "theoretical_loss": 3.5050408648274862, + "tokens_seen": 1554657280 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002671614844533601, + "loss": 2.5547, + "theoretical_loss": 3.50502792708278, + "tokens_seen": 1554722816 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2481099, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.630131244659424, + "objective/train/theoretical_loss": 3.505014990036119, + "objective/train/tokens_used": 1575248352, + "theoretical_loss": 3.505014990036119, + "tokens_seen": 1554788352 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002671514543630893, + "loss": 2.7846, + "theoretical_loss": 3.505014990036119, + "tokens_seen": 1554788352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026714142427281847, + "loss": 2.88, + "theoretical_loss": 3.5050020536874364, + "tokens_seen": 1554853888 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026713139418254765, + "loss": 2.838, + "theoretical_loss": 3.504989118036666, + "tokens_seen": 1554919424 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026712136409227683, + "loss": 2.7021, + "theoretical_loss": 3.5049761830837394, + "tokens_seen": 1554984960 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026711133400200606, + "loss": 2.7428, + "theoretical_loss": 3.50496324882859, + "tokens_seen": 1555050496 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002671013039117352, + "loss": 2.6559, + "theoretical_loss": 3.5049503152711505, + "tokens_seen": 1555116032 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002670912738214644, + "loss": 2.9355, + "theoretical_loss": 3.504937382411355, + "tokens_seen": 1555181568 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026708124373119355, + "loss": 2.7251, + "theoretical_loss": 3.5049244502491352, + "tokens_seen": 1555247104 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002670712136409228, + "loss": 2.903, + "theoretical_loss": 3.504911518784425, + "tokens_seen": 1555312640 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026706118355065197, + "loss": 2.8997, + "theoretical_loss": 3.5048985880171566, + "tokens_seen": 1555378176 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026705115346038115, + "loss": 2.7384, + "theoretical_loss": 3.5048856579472636, + "tokens_seen": 1555443712 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026704112337011033, + "loss": 2.8101, + "theoretical_loss": 3.5048727285746786, + "tokens_seen": 1555509248 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026703109327983957, + "loss": 2.734, + "theoretical_loss": 3.504859799899335, + "tokens_seen": 1555574784 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002670210631895687, + "loss": 2.8363, + "theoretical_loss": 3.5048468719211656, + "tokens_seen": 1555640320 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026701103309929793, + "loss": 2.8481, + "theoretical_loss": 3.5048339446401036, + "tokens_seen": 1555705856 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026700100300902706, + "loss": 2.8515, + "theoretical_loss": 3.5048210180560817, + "tokens_seen": 1555771392 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002669909729187563, + "loss": 2.858, + "theoretical_loss": 3.5048080921690334, + "tokens_seen": 1555836928 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026698094282848547, + "loss": 2.8183, + "theoretical_loss": 3.5047951669788913, + "tokens_seen": 1555902464 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026697091273821465, + "loss": 2.804, + "theoretical_loss": 3.5047822424855886, + "tokens_seen": 1555968000 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026696088264794383, + "loss": 2.8427, + "theoretical_loss": 3.5047693186890587, + "tokens_seen": 1556033536 + }, + { + "epoch": 4.06, + "learning_rate": 0.000266950852557673, + "loss": 2.7341, + "theoretical_loss": 3.5047563955892347, + "tokens_seen": 1556099072 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002669408224674022, + "loss": 2.761, + "theoretical_loss": 3.504743473186049, + "tokens_seen": 1556164608 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026693079237713143, + "loss": 2.8478, + "theoretical_loss": 3.504730551479436, + "tokens_seen": 1556230144 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026692076228686056, + "loss": 2.8597, + "theoretical_loss": 3.504717630469327, + "tokens_seen": 1556295680 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002669107321965898, + "loss": 2.7882, + "theoretical_loss": 3.5047047101556563, + "tokens_seen": 1556361216 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2483723, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.927309036254883, + "objective/train/theoretical_loss": 3.5046917905383577, + "objective/train/tokens_used": 1576886752, + "theoretical_loss": 3.5046917905383577, + "tokens_seen": 1556426752 + }, + { + "epoch": 4.06, + "learning_rate": 0.000266900702106319, + "loss": 2.8437, + "theoretical_loss": 3.5046917905383577, + "tokens_seen": 1556426752 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026689067201604816, + "loss": 2.8943, + "theoretical_loss": 3.5046788716173625, + "tokens_seen": 1556492288 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026688064192577734, + "loss": 2.7648, + "theoretical_loss": 3.504665953392606, + "tokens_seen": 1556557824 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002668706118355065, + "loss": 2.8493, + "theoretical_loss": 3.504653035864019, + "tokens_seen": 1556623360 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002668605817452357, + "loss": 2.6663, + "theoretical_loss": 3.504640119031537, + "tokens_seen": 1556688896 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026685055165496494, + "loss": 2.7211, + "theoretical_loss": 3.5046272028950916, + "tokens_seen": 1556754432 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026684052156469406, + "loss": 2.8655, + "theoretical_loss": 3.5046142874546167, + "tokens_seen": 1556819968 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002668304914744233, + "loss": 2.7363, + "theoretical_loss": 3.504601372710045, + "tokens_seen": 1556885504 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002668204613841524, + "loss": 2.8811, + "theoretical_loss": 3.5045884586613103, + "tokens_seen": 1556951040 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026681043129388166, + "loss": 2.8419, + "theoretical_loss": 3.5045755453083456, + "tokens_seen": 1557016576 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026680040120361084, + "loss": 2.8847, + "theoretical_loss": 3.504562632651084, + "tokens_seen": 1557082112 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026679037111334, + "loss": 2.831, + "theoretical_loss": 3.5045497206894582, + "tokens_seen": 1557147648 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002667803410230692, + "loss": 2.7616, + "theoretical_loss": 3.504536809423403, + "tokens_seen": 1557213184 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002667703109327984, + "loss": 2.7177, + "theoretical_loss": 3.5045238988528506, + "tokens_seen": 1557278720 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026676028084252757, + "loss": 2.7002, + "theoretical_loss": 3.504510988977734, + "tokens_seen": 1557344256 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002667502507522568, + "loss": 2.9003, + "theoretical_loss": 3.504498079797987, + "tokens_seen": 1557409792 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026674022066198593, + "loss": 2.9134, + "theoretical_loss": 3.5044851713135428, + "tokens_seen": 1557475328 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026673019057171516, + "loss": 2.9329, + "theoretical_loss": 3.5044722635243346, + "tokens_seen": 1557540864 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026672016048144434, + "loss": 2.9254, + "theoretical_loss": 3.5044593564302957, + "tokens_seen": 1557606400 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002667101303911735, + "loss": 2.7668, + "theoretical_loss": 3.5044464500313595, + "tokens_seen": 1557671936 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002667001003009027, + "loss": 2.8327, + "theoretical_loss": 3.5044335443274597, + "tokens_seen": 1557737472 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002666900702106319, + "loss": 2.8465, + "theoretical_loss": 3.504420639318529, + "tokens_seen": 1557803008 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026668004012036107, + "loss": 2.859, + "theoretical_loss": 3.5044077350045004, + "tokens_seen": 1557868544 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002666700100300903, + "loss": 2.7309, + "theoretical_loss": 3.5043948313853086, + "tokens_seen": 1557934080 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026665997993981943, + "loss": 2.7312, + "theoretical_loss": 3.504381928460886, + "tokens_seen": 1557999616 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2486426, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.767914056777954, + "objective/train/theoretical_loss": 3.504369026231166, + "objective/train/tokens_used": 1578525152, + "theoretical_loss": 3.504369026231166, + "tokens_seen": 1558065152 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026664994984954867, + "loss": 2.9135, + "theoretical_loss": 3.504369026231166, + "tokens_seen": 1558065152 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002666399197592778, + "loss": 2.8491, + "theoretical_loss": 3.5043561246960824, + "tokens_seen": 1558130688 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026662988966900703, + "loss": 2.7087, + "theoretical_loss": 3.504343223855568, + "tokens_seen": 1558196224 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002666198595787362, + "loss": 2.8948, + "theoretical_loss": 3.5043303237095564, + "tokens_seen": 1558261760 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002666098294884654, + "loss": 2.7918, + "theoretical_loss": 3.504317424257981, + "tokens_seen": 1558327296 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026659979939819457, + "loss": 2.7129, + "theoretical_loss": 3.5043045255007756, + "tokens_seen": 1558392832 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026658976930792375, + "loss": 2.812, + "theoretical_loss": 3.5042916274378735, + "tokens_seen": 1558458368 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026657973921765293, + "loss": 3.0078, + "theoretical_loss": 3.504278730069208, + "tokens_seen": 1558523904 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026656970912738217, + "loss": 2.704, + "theoretical_loss": 3.504265833394712, + "tokens_seen": 1558589440 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002665596790371113, + "loss": 2.7988, + "theoretical_loss": 3.50425293741432, + "tokens_seen": 1558654976 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026654964894684053, + "loss": 2.7906, + "theoretical_loss": 3.504240042127965, + "tokens_seen": 1558720512 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002665396188565697, + "loss": 2.7583, + "theoretical_loss": 3.50422714753558, + "tokens_seen": 1558786048 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002665295887662989, + "loss": 2.7898, + "theoretical_loss": 3.504214253637099, + "tokens_seen": 1558851584 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002665195586760281, + "loss": 2.8098, + "theoretical_loss": 3.5042013604324556, + "tokens_seen": 1558917120 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026650952858575726, + "loss": 2.9647, + "theoretical_loss": 3.5041884679215833, + "tokens_seen": 1558982656 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002664994984954865, + "loss": 2.9315, + "theoretical_loss": 3.504175576104415, + "tokens_seen": 1559048192 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026648946840521567, + "loss": 2.9295, + "theoretical_loss": 3.5041626849808845, + "tokens_seen": 1559113728 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026647943831494485, + "loss": 2.9545, + "theoretical_loss": 3.5041497945509255, + "tokens_seen": 1559179264 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026646940822467403, + "loss": 2.7378, + "theoretical_loss": 3.5041369048144713, + "tokens_seen": 1559244800 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002664593781344032, + "loss": 2.7521, + "theoretical_loss": 3.504124015771456, + "tokens_seen": 1559310336 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002664493480441324, + "loss": 2.9103, + "theoretical_loss": 3.5041111274218126, + "tokens_seen": 1559375872 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026643931795386163, + "loss": 2.785, + "theoretical_loss": 3.504098239765475, + "tokens_seen": 1559441408 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026642928786359076, + "loss": 2.7795, + "theoretical_loss": 3.504085352802376, + "tokens_seen": 1559506944 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026641925777332, + "loss": 2.8593, + "theoretical_loss": 3.50407246653245, + "tokens_seen": 1559572480 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002664092276830492, + "loss": 2.9198, + "theoretical_loss": 3.504059580955631, + "tokens_seen": 1559638016 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2489031, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9564785957336426, + "objective/train/theoretical_loss": 3.5040466960718515, + "objective/train/tokens_used": 1580163552, + "theoretical_loss": 3.5040466960718515, + "tokens_seen": 1559703552 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026639919759277836, + "loss": 2.9246, + "theoretical_loss": 3.5040466960718515, + "tokens_seen": 1559703552 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026638916750250754, + "loss": 2.8616, + "theoretical_loss": 3.5040338118810452, + "tokens_seen": 1559769088 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002663791374122367, + "loss": 2.8051, + "theoretical_loss": 3.5040209283831465, + "tokens_seen": 1559834624 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002663691073219659, + "loss": 2.9104, + "theoretical_loss": 3.5040080455780886, + "tokens_seen": 1559900160 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026635907723169514, + "loss": 2.7256, + "theoretical_loss": 3.503995163465805, + "tokens_seen": 1559965696 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026634904714142426, + "loss": 2.8656, + "theoretical_loss": 3.5039822820462296, + "tokens_seen": 1560031232 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002663390170511535, + "loss": 2.7348, + "theoretical_loss": 3.5039694013192957, + "tokens_seen": 1560096768 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002663289869608826, + "loss": 2.816, + "theoretical_loss": 3.5039565212849375, + "tokens_seen": 1560162304 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026631895687061186, + "loss": 2.8679, + "theoretical_loss": 3.503943641943088, + "tokens_seen": 1560227840 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026630892678034104, + "loss": 2.63, + "theoretical_loss": 3.5039307632936816, + "tokens_seen": 1560293376 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002662988966900702, + "loss": 2.8632, + "theoretical_loss": 3.5039178853366515, + "tokens_seen": 1560358912 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002662888665997994, + "loss": 2.823, + "theoretical_loss": 3.5039050080719316, + "tokens_seen": 1560424448 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002662788365095286, + "loss": 2.8066, + "theoretical_loss": 3.5038921314994553, + "tokens_seen": 1560489984 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026626880641925777, + "loss": 2.7267, + "theoretical_loss": 3.503879255619157, + "tokens_seen": 1560555520 + }, + { + "epoch": 4.06, + "learning_rate": 0.000266258776328987, + "loss": 2.8704, + "theoretical_loss": 3.5038663804309698, + "tokens_seen": 1560621056 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026624874623871613, + "loss": 2.52, + "theoretical_loss": 3.5038535059348277, + "tokens_seen": 1560686592 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026623871614844536, + "loss": 2.6858, + "theoretical_loss": 3.503840632130664, + "tokens_seen": 1560752128 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026622868605817454, + "loss": 2.8871, + "theoretical_loss": 3.5038277590184133, + "tokens_seen": 1560817664 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002662186559679037, + "loss": 2.8483, + "theoretical_loss": 3.5038148865980085, + "tokens_seen": 1560883200 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002662086258776329, + "loss": 2.7948, + "theoretical_loss": 3.5038020148693843, + "tokens_seen": 1560948736 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002661985957873621, + "loss": 2.6822, + "theoretical_loss": 3.5037891438324733, + "tokens_seen": 1561014272 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026618856569709127, + "loss": 2.8153, + "theoretical_loss": 3.50377627348721, + "tokens_seen": 1561079808 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002661785356068205, + "loss": 2.9018, + "theoretical_loss": 3.5037634038335286, + "tokens_seen": 1561145344 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026616850551654963, + "loss": 2.8238, + "theoretical_loss": 3.503750534871362, + "tokens_seen": 1561210880 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026615847542627887, + "loss": 2.8908, + "theoretical_loss": 3.503737666600645, + "tokens_seen": 1561276416 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2490587, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.90627384185791, + "objective/train/theoretical_loss": 3.5037247990213105, + "objective/train/tokens_used": 1581801952, + "theoretical_loss": 3.5037247990213105, + "tokens_seen": 1561341952 + }, + { + "epoch": 4.06, + "learning_rate": 0.000266148445336008, + "loss": 2.6666, + "theoretical_loss": 3.5037247990213105, + "tokens_seen": 1561341952 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026613841524573723, + "loss": 3.0193, + "theoretical_loss": 3.5037119321332924, + "tokens_seen": 1561407488 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002661283851554664, + "loss": 2.8514, + "theoretical_loss": 3.5036990659365252, + "tokens_seen": 1561473024 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002661183550651956, + "loss": 2.919, + "theoretical_loss": 3.5036862004309426, + "tokens_seen": 1561538560 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026610832497492477, + "loss": 2.7682, + "theoretical_loss": 3.5036733356164778, + "tokens_seen": 1561604096 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026609829488465395, + "loss": 2.6643, + "theoretical_loss": 3.503660471493066, + "tokens_seen": 1561669632 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026608826479438313, + "loss": 2.758, + "theoretical_loss": 3.503647608060639, + "tokens_seen": 1561735168 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026607823470411237, + "loss": 2.7961, + "theoretical_loss": 3.503634745319133, + "tokens_seen": 1561800704 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002660682046138415, + "loss": 2.776, + "theoretical_loss": 3.5036218832684805, + "tokens_seen": 1561866240 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026605817452357073, + "loss": 2.7044, + "theoretical_loss": 3.5036090219086153, + "tokens_seen": 1561931776 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002660481444332999, + "loss": 2.8029, + "theoretical_loss": 3.5035961612394724, + "tokens_seen": 1561997312 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002660381143430291, + "loss": 2.8089, + "theoretical_loss": 3.503583301260985, + "tokens_seen": 1562062848 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002660280842527583, + "loss": 2.6794, + "theoretical_loss": 3.5035704419730864, + "tokens_seen": 1562128384 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026601805416248746, + "loss": 2.8475, + "theoretical_loss": 3.503557583375712, + "tokens_seen": 1562193920 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026600802407221664, + "loss": 2.7447, + "theoretical_loss": 3.5035447254687946, + "tokens_seen": 1562259456 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026599799398194587, + "loss": 2.8185, + "theoretical_loss": 3.503531868252269, + "tokens_seen": 1562324992 + }, + { + "epoch": 4.06, + "learning_rate": 0.000265987963891675, + "loss": 2.7744, + "theoretical_loss": 3.503519011726068, + "tokens_seen": 1562390528 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026597793380140423, + "loss": 2.7688, + "theoretical_loss": 3.5035061558901273, + "tokens_seen": 1562456064 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026596790371113336, + "loss": 2.8012, + "theoretical_loss": 3.5034933007443794, + "tokens_seen": 1562521600 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002659578736208626, + "loss": 2.8439, + "theoretical_loss": 3.5034804462887585, + "tokens_seen": 1562587136 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002659478435305918, + "loss": 2.7144, + "theoretical_loss": 3.503467592523199, + "tokens_seen": 1562652672 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026593781344032096, + "loss": 2.7622, + "theoretical_loss": 3.503454739447635, + "tokens_seen": 1562718208 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026592778335005014, + "loss": 2.8561, + "theoretical_loss": 3.503441887062001, + "tokens_seen": 1562783744 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002659177532597794, + "loss": 2.6381, + "theoretical_loss": 3.5034290353662296, + "tokens_seen": 1562849280 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002659077231695085, + "loss": 2.6015, + "theoretical_loss": 3.5034161843602565, + "tokens_seen": 1562914816 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2493331, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.469290256500244, + "objective/train/theoretical_loss": 3.5034033340440143, + "objective/train/tokens_used": 1583440352, + "theoretical_loss": 3.5034033340440143, + "tokens_seen": 1562980352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026589769307923774, + "loss": 2.6803, + "theoretical_loss": 3.5034033340440143, + "tokens_seen": 1562980352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026588766298896686, + "loss": 2.8071, + "theoretical_loss": 3.5033904844174373, + "tokens_seen": 1563045888 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002658776328986961, + "loss": 2.7346, + "theoretical_loss": 3.5033776354804607, + "tokens_seen": 1563111424 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002658676028084253, + "loss": 2.8324, + "theoretical_loss": 3.503364787233017, + "tokens_seen": 1563176960 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026585757271815446, + "loss": 2.7347, + "theoretical_loss": 3.503351939675042, + "tokens_seen": 1563242496 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026584754262788364, + "loss": 2.9472, + "theoretical_loss": 3.5033390928064683, + "tokens_seen": 1563308032 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002658375125376128, + "loss": 2.701, + "theoretical_loss": 3.5033262466272306, + "tokens_seen": 1563373568 + }, + { + "epoch": 4.06, + "learning_rate": 0.000265827482447342, + "loss": 2.7461, + "theoretical_loss": 3.5033134011372633, + "tokens_seen": 1563439104 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026581745235707124, + "loss": 2.8262, + "theoretical_loss": 3.5033005563365003, + "tokens_seen": 1563504640 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026580742226680037, + "loss": 2.6862, + "theoretical_loss": 3.5032877122248753, + "tokens_seen": 1563570176 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002657973921765296, + "loss": 2.6213, + "theoretical_loss": 3.5032748688023236, + "tokens_seen": 1563635712 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026578736208625873, + "loss": 2.813, + "theoretical_loss": 3.5032620260687777, + "tokens_seen": 1563701248 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026577733199598797, + "loss": 2.8479, + "theoretical_loss": 3.503249184024173, + "tokens_seen": 1563766784 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026576730190571715, + "loss": 2.7879, + "theoretical_loss": 3.5032363426684436, + "tokens_seen": 1563832320 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026575727181544633, + "loss": 2.7598, + "theoretical_loss": 3.503223502001523, + "tokens_seen": 1563897856 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026574724172517556, + "loss": 2.8663, + "theoretical_loss": 3.5032106620233465, + "tokens_seen": 1563963392 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026573721163490474, + "loss": 2.7576, + "theoretical_loss": 3.5031978227338465, + "tokens_seen": 1564028928 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002657271815446339, + "loss": 2.8431, + "theoretical_loss": 3.503184984132959, + "tokens_seen": 1564094464 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002657171514543631, + "loss": 2.9785, + "theoretical_loss": 3.5031721462206176, + "tokens_seen": 1564160000 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002657071213640923, + "loss": 2.903, + "theoretical_loss": 3.5031593089967563, + "tokens_seen": 1564225536 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026569709127382147, + "loss": 2.9382, + "theoretical_loss": 3.5031464724613093, + "tokens_seen": 1564291072 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002656870611835507, + "loss": 2.7103, + "theoretical_loss": 3.5031336366142116, + "tokens_seen": 1564356608 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026567703109327983, + "loss": 2.688, + "theoretical_loss": 3.5031208014553963, + "tokens_seen": 1564422144 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026566700100300907, + "loss": 2.7613, + "theoretical_loss": 3.5031079669847984, + "tokens_seen": 1564487680 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002656569709127382, + "loss": 2.7297, + "theoretical_loss": 3.5030951332023523, + "tokens_seen": 1564553216 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2496156, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.611402750015259, + "objective/train/theoretical_loss": 3.503082300107992, + "objective/train/tokens_used": 1585078752, + "theoretical_loss": 3.503082300107992, + "tokens_seen": 1564618752 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026564694082246743, + "loss": 2.7854, + "theoretical_loss": 3.503082300107992, + "tokens_seen": 1564618752 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002656369107321966, + "loss": 2.744, + "theoretical_loss": 3.5030694677016507, + "tokens_seen": 1564684288 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002656268806419258, + "loss": 2.7545, + "theoretical_loss": 3.503056635983265, + "tokens_seen": 1564749824 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026561685055165497, + "loss": 2.85, + "theoretical_loss": 3.5030438049527675, + "tokens_seen": 1564815360 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026560682046138415, + "loss": 2.784, + "theoretical_loss": 3.503030974610093, + "tokens_seen": 1564880896 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026559679037111333, + "loss": 2.8449, + "theoretical_loss": 3.503018144955176, + "tokens_seen": 1564946432 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026558676028084257, + "loss": 2.8283, + "theoretical_loss": 3.5030053159879504, + "tokens_seen": 1565011968 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002655767301905717, + "loss": 2.9104, + "theoretical_loss": 3.502992487708351, + "tokens_seen": 1565077504 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026556670010030093, + "loss": 2.7909, + "theoretical_loss": 3.502979660116312, + "tokens_seen": 1565143040 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002655566700100301, + "loss": 2.8559, + "theoretical_loss": 3.502966833211768, + "tokens_seen": 1565208576 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002655466399197593, + "loss": 2.7832, + "theoretical_loss": 3.5029540069946528, + "tokens_seen": 1565274112 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002655366098294885, + "loss": 2.8464, + "theoretical_loss": 3.5029411814649007, + "tokens_seen": 1565339648 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026552657973921766, + "loss": 2.813, + "theoretical_loss": 3.502928356622447, + "tokens_seen": 1565405184 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026551654964894684, + "loss": 2.6957, + "theoretical_loss": 3.502915532467225, + "tokens_seen": 1565470720 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026550651955867607, + "loss": 2.8233, + "theoretical_loss": 3.5029027089991702, + "tokens_seen": 1565536256 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002654964894684052, + "loss": 2.8449, + "theoretical_loss": 3.502889886218216, + "tokens_seen": 1565601792 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026548645937813443, + "loss": 2.792, + "theoretical_loss": 3.5028770641242977, + "tokens_seen": 1565667328 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026547642928786356, + "loss": 2.8836, + "theoretical_loss": 3.502864242717349, + "tokens_seen": 1565732864 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002654663991975928, + "loss": 2.8182, + "theoretical_loss": 3.5028514219973053, + "tokens_seen": 1565798400 + }, + { + "epoch": 4.06, + "learning_rate": 0.000265456369107322, + "loss": 2.7295, + "theoretical_loss": 3.5028386019640996, + "tokens_seen": 1565863936 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026544633901705116, + "loss": 2.8656, + "theoretical_loss": 3.502825782617668, + "tokens_seen": 1565929472 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026543630892678034, + "loss": 2.8152, + "theoretical_loss": 3.5028129639579433, + "tokens_seen": 1565995008 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002654262788365096, + "loss": 2.7697, + "theoretical_loss": 3.5028001459848617, + "tokens_seen": 1566060544 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002654162487462387, + "loss": 2.8124, + "theoretical_loss": 3.502787328698356, + "tokens_seen": 1566126080 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026540621865596794, + "loss": 2.7841, + "theoretical_loss": 3.5027745120983624, + "tokens_seen": 1566191616 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2498947, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7916433811187744, + "objective/train/theoretical_loss": 3.5027616961848143, + "objective/train/tokens_used": 1586717152, + "theoretical_loss": 3.5027616961848143, + "tokens_seen": 1566257152 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026539618856569707, + "loss": 2.7941, + "theoretical_loss": 3.5027616961848143, + "tokens_seen": 1566257152 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002653861584754263, + "loss": 2.8822, + "theoretical_loss": 3.502748880957646, + "tokens_seen": 1566322688 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002653761283851555, + "loss": 2.8921, + "theoretical_loss": 3.5027360664167926, + "tokens_seen": 1566388224 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026536609829488466, + "loss": 2.7415, + "theoretical_loss": 3.5027232525621885, + "tokens_seen": 1566453760 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026535606820461384, + "loss": 2.6023, + "theoretical_loss": 3.5027104393937685, + "tokens_seen": 1566519296 + }, + { + "epoch": 4.06, + "learning_rate": 0.000265346038114343, + "loss": 2.9043, + "theoretical_loss": 3.5026976269114662, + "tokens_seen": 1566584832 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002653360080240722, + "loss": 2.7857, + "theoretical_loss": 3.5026848151152175, + "tokens_seen": 1566650368 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026532597793380144, + "loss": 2.8314, + "theoretical_loss": 3.5026720040049564, + "tokens_seen": 1566715904 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026531594784353057, + "loss": 2.8005, + "theoretical_loss": 3.502659193580617, + "tokens_seen": 1566781440 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002653059177532598, + "loss": 2.7658, + "theoretical_loss": 3.5026463838421344, + "tokens_seen": 1566846976 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026529588766298893, + "loss": 2.6452, + "theoretical_loss": 3.502633574789443, + "tokens_seen": 1566912512 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026528585757271817, + "loss": 2.7231, + "theoretical_loss": 3.5026207664224778, + "tokens_seen": 1566978048 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026527582748244735, + "loss": 2.8669, + "theoretical_loss": 3.502607958741173, + "tokens_seen": 1567043584 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026526579739217653, + "loss": 2.8977, + "theoretical_loss": 3.5025951517454628, + "tokens_seen": 1567109120 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002652557673019057, + "loss": 2.7166, + "theoretical_loss": 3.502582345435283, + "tokens_seen": 1567174656 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026524573721163494, + "loss": 2.9381, + "theoretical_loss": 3.502569539810567, + "tokens_seen": 1567240192 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026523570712136407, + "loss": 2.9562, + "theoretical_loss": 3.5025567348712507, + "tokens_seen": 1567305728 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002652256770310933, + "loss": 2.8455, + "theoretical_loss": 3.502543930617268, + "tokens_seen": 1567371264 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026521564694082243, + "loss": 2.7942, + "theoretical_loss": 3.5025311270485533, + "tokens_seen": 1567436800 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026520561685055167, + "loss": 2.8319, + "theoretical_loss": 3.5025183241650417, + "tokens_seen": 1567502336 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026519558676028085, + "loss": 2.8661, + "theoretical_loss": 3.5025055219666674, + "tokens_seen": 1567567872 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026518555667001003, + "loss": 2.87, + "theoretical_loss": 3.5024927204533665, + "tokens_seen": 1567633408 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002651755265797392, + "loss": 2.8737, + "theoretical_loss": 3.5024799196250718, + "tokens_seen": 1567698944 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002651654964894684, + "loss": 2.6928, + "theoretical_loss": 3.5024671194817194, + "tokens_seen": 1567764480 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002651554663991976, + "loss": 2.7862, + "theoretical_loss": 3.5024543200232436, + "tokens_seen": 1567830016 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2501833, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.971527338027954, + "objective/train/theoretical_loss": 3.5024415212495787, + "objective/train/tokens_used": 1588355552, + "theoretical_loss": 3.5024415212495787, + "tokens_seen": 1567895552 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002651454363089268, + "loss": 2.7743, + "theoretical_loss": 3.5024415212495787, + "tokens_seen": 1567895552 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026513540621865594, + "loss": 2.9659, + "theoretical_loss": 3.5024287231606603, + "tokens_seen": 1567961088 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026512537612838517, + "loss": 2.685, + "theoretical_loss": 3.5024159257564222, + "tokens_seen": 1568026624 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002651153460381143, + "loss": 2.8048, + "theoretical_loss": 3.5024031290368, + "tokens_seen": 1568092160 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026510531594784353, + "loss": 2.7217, + "theoretical_loss": 3.5023903330017276, + "tokens_seen": 1568157696 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002650952858575727, + "loss": 2.8637, + "theoretical_loss": 3.5023775376511406, + "tokens_seen": 1568223232 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002650852557673019, + "loss": 2.901, + "theoretical_loss": 3.5023647429849736, + "tokens_seen": 1568288768 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002650752256770311, + "loss": 2.8824, + "theoretical_loss": 3.502351949003161, + "tokens_seen": 1568354304 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002650651955867603, + "loss": 2.7511, + "theoretical_loss": 3.502339155705638, + "tokens_seen": 1568419840 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026505516549648944, + "loss": 2.7984, + "theoretical_loss": 3.502326363092339, + "tokens_seen": 1568485376 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002650451354062187, + "loss": 2.7004, + "theoretical_loss": 3.5023135711631994, + "tokens_seen": 1568550912 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002650351053159478, + "loss": 2.6531, + "theoretical_loss": 3.502300779918153, + "tokens_seen": 1568616448 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026502507522567704, + "loss": 2.8374, + "theoretical_loss": 3.5022879893571357, + "tokens_seen": 1568681984 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002650150451354062, + "loss": 2.837, + "theoretical_loss": 3.5022751994800823, + "tokens_seen": 1568747520 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002650050150451354, + "loss": 2.7044, + "theoretical_loss": 3.502262410286927, + "tokens_seen": 1568813056 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026499498495486464, + "loss": 2.8125, + "theoretical_loss": 3.5022496217776053, + "tokens_seen": 1568878592 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026498495486459376, + "loss": 2.9122, + "theoretical_loss": 3.5022368339520513, + "tokens_seen": 1568944128 + }, + { + "epoch": 4.06, + "learning_rate": 0.000264974924774323, + "loss": 2.7908, + "theoretical_loss": 3.502224046810201, + "tokens_seen": 1569009664 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002649648946840522, + "loss": 2.8659, + "theoretical_loss": 3.502211260351988, + "tokens_seen": 1569075200 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026495486459378136, + "loss": 2.7612, + "theoretical_loss": 3.502198474577348, + "tokens_seen": 1569140736 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026494483450351054, + "loss": 2.4859, + "theoretical_loss": 3.5021856894862164, + "tokens_seen": 1569206272 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002649348044132398, + "loss": 2.7106, + "theoretical_loss": 3.5021729050785266, + "tokens_seen": 1569271808 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002649247743229689, + "loss": 2.7509, + "theoretical_loss": 3.5021601213542146, + "tokens_seen": 1569337344 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026491474423269814, + "loss": 2.8074, + "theoretical_loss": 3.502147338313215, + "tokens_seen": 1569402880 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026490471414242727, + "loss": 2.8234, + "theoretical_loss": 3.5021345559554633, + "tokens_seen": 1569468416 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2504634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.920652389526367, + "objective/train/theoretical_loss": 3.502121774280894, + "objective/train/tokens_used": 1589993952, + "theoretical_loss": 3.502121774280894, + "tokens_seen": 1569533952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002648946840521565, + "loss": 2.8097, + "theoretical_loss": 3.502121774280894, + "tokens_seen": 1569533952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002648846539618857, + "loss": 2.8496, + "theoretical_loss": 3.5021089932894416, + "tokens_seen": 1569599488 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026487462387161486, + "loss": 2.7674, + "theoretical_loss": 3.502096212981042, + "tokens_seen": 1569665024 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026486459378134404, + "loss": 2.727, + "theoretical_loss": 3.5020834333556294, + "tokens_seen": 1569730560 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002648545636910732, + "loss": 2.599, + "theoretical_loss": 3.5020706544131395, + "tokens_seen": 1569796096 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002648445336008024, + "loss": 2.8835, + "theoretical_loss": 3.502057876153507, + "tokens_seen": 1569861632 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026483450351053164, + "loss": 2.6363, + "theoretical_loss": 3.502045098576666, + "tokens_seen": 1569927168 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026482447342026077, + "loss": 2.7437, + "theoretical_loss": 3.5020323216825533, + "tokens_seen": 1569992704 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026481444332999, + "loss": 2.6798, + "theoretical_loss": 3.502019545471102, + "tokens_seen": 1570058240 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026480441323971913, + "loss": 2.7303, + "theoretical_loss": 3.502006769942249, + "tokens_seen": 1570123776 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026479438314944837, + "loss": 2.6658, + "theoretical_loss": 3.501993995095928, + "tokens_seen": 1570189312 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026478435305917755, + "loss": 2.8974, + "theoretical_loss": 3.501981220932074, + "tokens_seen": 1570254848 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026477432296890673, + "loss": 2.6982, + "theoretical_loss": 3.5019684474506234, + "tokens_seen": 1570320384 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002647642928786359, + "loss": 2.9025, + "theoretical_loss": 3.5019556746515104, + "tokens_seen": 1570385920 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026475426278836514, + "loss": 2.8052, + "theoretical_loss": 3.5019429025346698, + "tokens_seen": 1570451456 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026474423269809427, + "loss": 2.8282, + "theoretical_loss": 3.501930131100037, + "tokens_seen": 1570516992 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002647342026078235, + "loss": 2.8364, + "theoretical_loss": 3.501917360347547, + "tokens_seen": 1570582528 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026472417251755263, + "loss": 2.6937, + "theoretical_loss": 3.501904590277135, + "tokens_seen": 1570648064 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026471414242728187, + "loss": 2.8052, + "theoretical_loss": 3.501891820888736, + "tokens_seen": 1570713600 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026470411233701105, + "loss": 2.9486, + "theoretical_loss": 3.5018790521822853, + "tokens_seen": 1570779136 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026469408224674023, + "loss": 2.7426, + "theoretical_loss": 3.501866284157718, + "tokens_seen": 1570844672 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002646840521564694, + "loss": 2.7499, + "theoretical_loss": 3.501853516814969, + "tokens_seen": 1570910208 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002646740220661986, + "loss": 2.9982, + "theoretical_loss": 3.501840750153974, + "tokens_seen": 1570975744 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002646639919759278, + "loss": 2.7916, + "theoretical_loss": 3.5018279841746676, + "tokens_seen": 1571041280 + }, + { + "epoch": 4.06, + "learning_rate": 0.000264653961885657, + "loss": 2.8291, + "theoretical_loss": 3.501815218876985, + "tokens_seen": 1571106816 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2507445, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9681766033172607, + "objective/train/theoretical_loss": 3.501802454260862, + "objective/train/tokens_used": 1591632352, + "theoretical_loss": 3.501802454260862, + "tokens_seen": 1571172352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026464393179538614, + "loss": 2.7369, + "theoretical_loss": 3.501802454260862, + "tokens_seen": 1571172352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026463390170511537, + "loss": 2.867, + "theoretical_loss": 3.501789690326233, + "tokens_seen": 1571237888 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002646238716148445, + "loss": 2.8883, + "theoretical_loss": 3.501776927073033, + "tokens_seen": 1571303424 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026461384152457373, + "loss": 2.6855, + "theoretical_loss": 3.5017641645011985, + "tokens_seen": 1571368960 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002646038114343029, + "loss": 2.6707, + "theoretical_loss": 3.5017514026106635, + "tokens_seen": 1571434496 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002645937813440321, + "loss": 2.7715, + "theoretical_loss": 3.5017386414013636, + "tokens_seen": 1571500032 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002645837512537613, + "loss": 2.6568, + "theoretical_loss": 3.5017258808732343, + "tokens_seen": 1571565568 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002645737211634905, + "loss": 2.8747, + "theoretical_loss": 3.501713121026211, + "tokens_seen": 1571631104 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026456369107321964, + "loss": 2.6573, + "theoretical_loss": 3.501700361860228, + "tokens_seen": 1571696640 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002645536609829489, + "loss": 2.991, + "theoretical_loss": 3.5016876033752213, + "tokens_seen": 1571762176 + }, + { + "epoch": 4.06, + "learning_rate": 0.000264543630892678, + "loss": 2.8322, + "theoretical_loss": 3.5016748455711255, + "tokens_seen": 1571827712 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026453360080240724, + "loss": 2.8268, + "theoretical_loss": 3.501662088447877, + "tokens_seen": 1571893248 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002645235707121364, + "loss": 2.8769, + "theoretical_loss": 3.5016493320054103, + "tokens_seen": 1571958784 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002645135406218656, + "loss": 2.8392, + "theoretical_loss": 3.5016365762436603, + "tokens_seen": 1572024320 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002645035105315948, + "loss": 2.6465, + "theoretical_loss": 3.501623821162563, + "tokens_seen": 1572089856 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026449348044132396, + "loss": 2.7931, + "theoretical_loss": 3.501611066762054, + "tokens_seen": 1572155392 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026448345035105314, + "loss": 2.8533, + "theoretical_loss": 3.501598313042068, + "tokens_seen": 1572220928 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002644734202607824, + "loss": 2.7642, + "theoretical_loss": 3.50158556000254, + "tokens_seen": 1572286464 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002644633901705115, + "loss": 2.8029, + "theoretical_loss": 3.5015728076434063, + "tokens_seen": 1572352000 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026445336008024074, + "loss": 2.7723, + "theoretical_loss": 3.5015600559646014, + "tokens_seen": 1572417536 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026444332998996987, + "loss": 2.8481, + "theoretical_loss": 3.501547304966061, + "tokens_seen": 1572483072 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002644332998996991, + "loss": 2.8147, + "theoretical_loss": 3.5015345546477206, + "tokens_seen": 1572548608 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002644232698094283, + "loss": 2.8728, + "theoretical_loss": 3.5015218050095154, + "tokens_seen": 1572614144 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026441323971915747, + "loss": 2.496, + "theoretical_loss": 3.501509056051381, + "tokens_seen": 1572679680 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026440320962888665, + "loss": 2.8361, + "theoretical_loss": 3.501496307773252, + "tokens_seen": 1572745216 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2508906, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9019999504089355, + "objective/train/theoretical_loss": 3.501483560175065, + "objective/train/tokens_used": 1593270752, + "theoretical_loss": 3.501483560175065, + "tokens_seen": 1572810752 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002643931795386159, + "loss": 2.9392, + "theoretical_loss": 3.501483560175065, + "tokens_seen": 1572810752 + }, + { + "epoch": 4.06, + "learning_rate": 0.000264383149448345, + "loss": 2.8022, + "theoretical_loss": 3.5014708132567547, + "tokens_seen": 1572876288 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026437311935807424, + "loss": 2.8017, + "theoretical_loss": 3.5014580670182562, + "tokens_seen": 1572941824 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026436308926780337, + "loss": 2.6748, + "theoretical_loss": 3.5014453214595056, + "tokens_seen": 1573007360 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002643530591775326, + "loss": 2.695, + "theoretical_loss": 3.501432576580438, + "tokens_seen": 1573072896 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002643430290872618, + "loss": 2.8783, + "theoretical_loss": 3.5014198323809893, + "tokens_seen": 1573138432 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026433299899699097, + "loss": 2.8754, + "theoretical_loss": 3.501407088861094, + "tokens_seen": 1573203968 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026432296890672015, + "loss": 2.7986, + "theoretical_loss": 3.501394346020688, + "tokens_seen": 1573269504 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026431293881644933, + "loss": 2.7684, + "theoretical_loss": 3.5013816038597074, + "tokens_seen": 1573335040 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002643029087261785, + "loss": 2.9431, + "theoretical_loss": 3.5013688623780865, + "tokens_seen": 1573400576 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026429287863590775, + "loss": 2.8664, + "theoretical_loss": 3.5013561215757623, + "tokens_seen": 1573466112 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002642828485456369, + "loss": 2.7441, + "theoretical_loss": 3.501343381452669, + "tokens_seen": 1573531648 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002642728184553661, + "loss": 2.8902, + "theoretical_loss": 3.5013306420087424, + "tokens_seen": 1573597184 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026426278836509534, + "loss": 2.7318, + "theoretical_loss": 3.501317903243918, + "tokens_seen": 1573662720 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026425275827482447, + "loss": 2.7545, + "theoretical_loss": 3.501305165158132, + "tokens_seen": 1573728256 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002642427281845537, + "loss": 2.8519, + "theoretical_loss": 3.501292427751319, + "tokens_seen": 1573793792 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026423269809428283, + "loss": 2.7439, + "theoretical_loss": 3.5012796910234147, + "tokens_seen": 1573859328 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026422266800401207, + "loss": 2.6815, + "theoretical_loss": 3.5012669549743554, + "tokens_seen": 1573924864 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026421263791374125, + "loss": 2.7468, + "theoretical_loss": 3.5012542196040757, + "tokens_seen": 1573990400 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026420260782347043, + "loss": 2.6672, + "theoretical_loss": 3.501241484912512, + "tokens_seen": 1574055936 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002641925777331996, + "loss": 2.8273, + "theoretical_loss": 3.501228750899599, + "tokens_seen": 1574121472 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002641825476429288, + "loss": 2.7027, + "theoretical_loss": 3.501216017565273, + "tokens_seen": 1574187008 + }, + { + "epoch": 4.06, + "learning_rate": 0.000264172517552658, + "loss": 2.9165, + "theoretical_loss": 3.5012032849094696, + "tokens_seen": 1574252544 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002641624874623872, + "loss": 2.9419, + "theoretical_loss": 3.501190552932124, + "tokens_seen": 1574318080 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026415245737211634, + "loss": 2.8957, + "theoretical_loss": 3.5011778216331715, + "tokens_seen": 1574383616 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2511371, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7393381595611572, + "objective/train/theoretical_loss": 3.501165091012549, + "objective/train/tokens_used": 1594909152, + "theoretical_loss": 3.501165091012549, + "tokens_seen": 1574449152 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026414242728184557, + "loss": 2.8329, + "theoretical_loss": 3.501165091012549, + "tokens_seen": 1574449152 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002641323971915747, + "loss": 2.8426, + "theoretical_loss": 3.5011523610701905, + "tokens_seen": 1574514688 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026412236710130393, + "loss": 2.9291, + "theoretical_loss": 3.5011396318060326, + "tokens_seen": 1574580224 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002641123370110331, + "loss": 2.8468, + "theoretical_loss": 3.501126903220011, + "tokens_seen": 1574645760 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002641023069207623, + "loss": 2.8192, + "theoretical_loss": 3.5011141753120607, + "tokens_seen": 1574711296 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002640922768304915, + "loss": 2.7586, + "theoretical_loss": 3.5011014480821183, + "tokens_seen": 1574776832 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002640822467402207, + "loss": 2.8086, + "theoretical_loss": 3.5010887215301185, + "tokens_seen": 1574842368 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026407221664994984, + "loss": 2.787, + "theoretical_loss": 3.501075995655998, + "tokens_seen": 1574907904 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002640621865596791, + "loss": 2.7886, + "theoretical_loss": 3.501063270459691, + "tokens_seen": 1574973440 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002640521564694082, + "loss": 2.7035, + "theoretical_loss": 3.5010505459411343, + "tokens_seen": 1575038976 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026404212637913744, + "loss": 2.9222, + "theoretical_loss": 3.5010378221002636, + "tokens_seen": 1575104512 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002640320962888666, + "loss": 2.8946, + "theoretical_loss": 3.501025098937015, + "tokens_seen": 1575170048 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002640220661985958, + "loss": 2.8382, + "theoretical_loss": 3.501012376451323, + "tokens_seen": 1575235584 + }, + { + "epoch": 4.06, + "learning_rate": 0.000264012036108325, + "loss": 2.7264, + "theoretical_loss": 3.500999654643124, + "tokens_seen": 1575301120 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026400200601805416, + "loss": 2.7012, + "theoretical_loss": 3.500986933512354, + "tokens_seen": 1575366656 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026399197592778334, + "loss": 2.8395, + "theoretical_loss": 3.5009742130589485, + "tokens_seen": 1575432192 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002639819458375126, + "loss": 2.7865, + "theoretical_loss": 3.500961493282843, + "tokens_seen": 1575497728 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002639719157472417, + "loss": 2.7622, + "theoretical_loss": 3.500948774183973, + "tokens_seen": 1575563264 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026396188565697094, + "loss": 2.7648, + "theoretical_loss": 3.500936055762275, + "tokens_seen": 1575628800 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026395185556670007, + "loss": 2.7916, + "theoretical_loss": 3.500923338017685, + "tokens_seen": 1575694336 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002639418254764293, + "loss": 2.8238, + "theoretical_loss": 3.500910620950138, + "tokens_seen": 1575759872 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002639317953861585, + "loss": 2.8174, + "theoretical_loss": 3.5008979045595705, + "tokens_seen": 1575825408 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026392176529588767, + "loss": 2.8172, + "theoretical_loss": 3.500885188845918, + "tokens_seen": 1575890944 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026391173520561685, + "loss": 2.8379, + "theoretical_loss": 3.5008724738091157, + "tokens_seen": 1575956480 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002639017051153461, + "loss": 2.7443, + "theoretical_loss": 3.5008597594491, + "tokens_seen": 1576022016 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2514179, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.583534002304077, + "objective/train/theoretical_loss": 3.5008470457658074, + "objective/train/tokens_used": 1596547552, + "theoretical_loss": 3.5008470457658074, + "tokens_seen": 1576087552 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002638916750250752, + "loss": 2.6768, + "theoretical_loss": 3.5008470457658074, + "tokens_seen": 1576087552 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026388164493480444, + "loss": 2.7219, + "theoretical_loss": 3.5008343327591724, + "tokens_seen": 1576153088 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026387161484453357, + "loss": 2.8205, + "theoretical_loss": 3.5008216204291314, + "tokens_seen": 1576218624 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002638615847542628, + "loss": 2.8185, + "theoretical_loss": 3.500808908775621, + "tokens_seen": 1576284160 + }, + { + "epoch": 4.06, + "learning_rate": 0.000263851554663992, + "loss": 2.8078, + "theoretical_loss": 3.500796197798576, + "tokens_seen": 1576349696 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026384152457372117, + "loss": 2.7631, + "theoretical_loss": 3.500783487497933, + "tokens_seen": 1576415232 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026383149448345035, + "loss": 2.8298, + "theoretical_loss": 3.500770777873628, + "tokens_seen": 1576480768 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026382146439317953, + "loss": 2.9113, + "theoretical_loss": 3.5007580689255957, + "tokens_seen": 1576546304 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002638114343029087, + "loss": 2.773, + "theoretical_loss": 3.5007453606537733, + "tokens_seen": 1576611840 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026380140421263795, + "loss": 2.7557, + "theoretical_loss": 3.500732653058096, + "tokens_seen": 1576677376 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002637913741223671, + "loss": 2.685, + "theoretical_loss": 3.5007199461385, + "tokens_seen": 1576742912 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002637813440320963, + "loss": 2.6875, + "theoretical_loss": 3.5007072398949215, + "tokens_seen": 1576808448 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002637713139418255, + "loss": 2.8872, + "theoretical_loss": 3.500694534327296, + "tokens_seen": 1576873984 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026376128385155467, + "loss": 2.7418, + "theoretical_loss": 3.5006818294355595, + "tokens_seen": 1576939520 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026375125376128385, + "loss": 2.7668, + "theoretical_loss": 3.500669125219648, + "tokens_seen": 1577005056 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026374122367101303, + "loss": 2.8053, + "theoretical_loss": 3.5006564216794978, + "tokens_seen": 1577070592 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002637311935807422, + "loss": 2.8739, + "theoretical_loss": 3.500643718815044, + "tokens_seen": 1577136128 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026372116349047145, + "loss": 2.8009, + "theoretical_loss": 3.5006310166262233, + "tokens_seen": 1577201664 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002637111334002006, + "loss": 2.7476, + "theoretical_loss": 3.500618315112972, + "tokens_seen": 1577267200 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002637011033099298, + "loss": 2.9269, + "theoretical_loss": 3.5006056142752255, + "tokens_seen": 1577332736 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026369107321965894, + "loss": 2.7809, + "theoretical_loss": 3.5005929141129197, + "tokens_seen": 1577398272 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002636810431293882, + "loss": 2.8479, + "theoretical_loss": 3.500580214625991, + "tokens_seen": 1577463808 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026367101303911736, + "loss": 2.8554, + "theoretical_loss": 3.5005675158143754, + "tokens_seen": 1577529344 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026366098294884654, + "loss": 2.8175, + "theoretical_loss": 3.500554817678009, + "tokens_seen": 1577594880 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002636509528585757, + "loss": 3.0443, + "theoretical_loss": 3.500542120216828, + "tokens_seen": 1577660416 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2517085, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2192492485046387, + "objective/train/theoretical_loss": 3.5005294234307676, + "objective/train/tokens_used": 1598185952, + "theoretical_loss": 3.5005294234307676, + "tokens_seen": 1577725952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002636409227683049, + "loss": 2.8687, + "theoretical_loss": 3.5005294234307676, + "tokens_seen": 1577725952 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002636308926780341, + "loss": 2.7769, + "theoretical_loss": 3.5005167273197646, + "tokens_seen": 1577791488 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002636208625877633, + "loss": 2.8132, + "theoretical_loss": 3.5005040318837546, + "tokens_seen": 1577857024 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026361083249749244, + "loss": 2.7382, + "theoretical_loss": 3.5004913371226745, + "tokens_seen": 1577922560 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002636008024072217, + "loss": 2.7008, + "theoretical_loss": 3.5004786430364594, + "tokens_seen": 1577988096 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026359077231695086, + "loss": 2.8748, + "theoretical_loss": 3.5004659496250463, + "tokens_seen": 1578053632 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026358074222668004, + "loss": 2.6991, + "theoretical_loss": 3.5004532568883704, + "tokens_seen": 1578119168 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002635707121364092, + "loss": 2.7904, + "theoretical_loss": 3.5004405648263686, + "tokens_seen": 1578184704 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002635606820461384, + "loss": 2.8915, + "theoretical_loss": 3.5004278734389764, + "tokens_seen": 1578250240 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002635506519558676, + "loss": 2.7567, + "theoretical_loss": 3.5004151827261305, + "tokens_seen": 1578315776 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002635406218655968, + "loss": 2.9123, + "theoretical_loss": 3.500402492687767, + "tokens_seen": 1578381312 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026353059177532595, + "loss": 2.9028, + "theoretical_loss": 3.500389803323821, + "tokens_seen": 1578446848 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002635205616850552, + "loss": 2.9587, + "theoretical_loss": 3.5003771146342304, + "tokens_seen": 1578512384 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026351053159478436, + "loss": 2.779, + "theoretical_loss": 3.50036442661893, + "tokens_seen": 1578577920 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026350050150451354, + "loss": 2.8977, + "theoretical_loss": 3.500351739277857, + "tokens_seen": 1578643456 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002634904714142428, + "loss": 2.536, + "theoretical_loss": 3.500339052610946, + "tokens_seen": 1578708992 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002634804413239719, + "loss": 2.7309, + "theoretical_loss": 3.5003263666181352, + "tokens_seen": 1578774528 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026347041123370114, + "loss": 2.8008, + "theoretical_loss": 3.5003136812993594, + "tokens_seen": 1578840064 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026346038114343027, + "loss": 2.8848, + "theoretical_loss": 3.5003009966545546, + "tokens_seen": 1578905600 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002634503510531595, + "loss": 2.824, + "theoretical_loss": 3.500288312683659, + "tokens_seen": 1578971136 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002634403209628887, + "loss": 2.7889, + "theoretical_loss": 3.500275629386606, + "tokens_seen": 1579036672 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026343029087261787, + "loss": 2.8574, + "theoretical_loss": 3.500262946763334, + "tokens_seen": 1579102208 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026342026078234705, + "loss": 2.8993, + "theoretical_loss": 3.5002502648137788, + "tokens_seen": 1579167744 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002634102306920763, + "loss": 2.8925, + "theoretical_loss": 3.500237583537876, + "tokens_seen": 1579233280 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002634002006018054, + "loss": 2.66, + "theoretical_loss": 3.5002249029355625, + "tokens_seen": 1579298816 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2519888, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9735991954803467, + "objective/train/theoretical_loss": 3.5002122230067743, + "objective/train/tokens_used": 1599824352, + "theoretical_loss": 3.5002122230067743, + "tokens_seen": 1579364352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026339017051153464, + "loss": 2.8806, + "theoretical_loss": 3.5002122230067743, + "tokens_seen": 1579364352 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026338014042126377, + "loss": 2.7623, + "theoretical_loss": 3.5001995437514477, + "tokens_seen": 1579429888 + }, + { + "epoch": 4.06, + "learning_rate": 0.000263370110330993, + "loss": 2.8006, + "theoretical_loss": 3.500186865169519, + "tokens_seen": 1579495424 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002633600802407222, + "loss": 2.932, + "theoretical_loss": 3.5001741872609244, + "tokens_seen": 1579560960 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026335005015045137, + "loss": 2.8041, + "theoretical_loss": 3.5001615100256007, + "tokens_seen": 1579626496 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026334002006018055, + "loss": 2.8474, + "theoretical_loss": 3.5001488334634834, + "tokens_seen": 1579692032 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026332998996990973, + "loss": 2.7611, + "theoretical_loss": 3.5001361575745094, + "tokens_seen": 1579757568 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002633199598796389, + "loss": 2.7415, + "theoretical_loss": 3.5001234823586147, + "tokens_seen": 1579823104 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026330992978936815, + "loss": 2.8832, + "theoretical_loss": 3.500110807815736, + "tokens_seen": 1579888640 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002632998996990973, + "loss": 2.9128, + "theoretical_loss": 3.5000981339458095, + "tokens_seen": 1579954176 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002632898696088265, + "loss": 2.7475, + "theoretical_loss": 3.5000854607487715, + "tokens_seen": 1580019712 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002632798395185557, + "loss": 2.8376, + "theoretical_loss": 3.5000727882245584, + "tokens_seen": 1580085248 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026326980942828487, + "loss": 2.7102, + "theoretical_loss": 3.500060116373106, + "tokens_seen": 1580150784 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026325977933801405, + "loss": 2.7818, + "theoretical_loss": 3.5000474451943524, + "tokens_seen": 1580216320 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026324974924774323, + "loss": 2.9683, + "theoretical_loss": 3.500034774688232, + "tokens_seen": 1580281856 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002632397191574724, + "loss": 2.8217, + "theoretical_loss": 3.5000221048546827, + "tokens_seen": 1580347392 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026322968906720165, + "loss": 2.8596, + "theoretical_loss": 3.5000094356936398, + "tokens_seen": 1580412928 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002632196589769308, + "loss": 2.8016, + "theoretical_loss": 3.4999967672050403, + "tokens_seen": 1580478464 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026320962888666, + "loss": 2.9447, + "theoretical_loss": 3.4999840993888203, + "tokens_seen": 1580544000 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026319959879638914, + "loss": 2.9651, + "theoretical_loss": 3.4999714322449167, + "tokens_seen": 1580609536 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002631895687061184, + "loss": 2.8711, + "theoretical_loss": 3.4999587657732656, + "tokens_seen": 1580675072 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026317953861584756, + "loss": 2.8782, + "theoretical_loss": 3.4999460999738035, + "tokens_seen": 1580740608 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026316950852557674, + "loss": 2.8181, + "theoretical_loss": 3.499933434846467, + "tokens_seen": 1580806144 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002631594784353059, + "loss": 2.7439, + "theoretical_loss": 3.499920770391192, + "tokens_seen": 1580871680 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002631494483450351, + "loss": 2.6889, + "theoretical_loss": 3.499908106607916, + "tokens_seen": 1580937216 + }, + { + "epoch": 4.06, + "objective/train/docs_used": 2522805, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7724010944366455, + "objective/train/theoretical_loss": 3.499895443496575, + "objective/train/tokens_used": 1601462752, + "theoretical_loss": 3.499895443496575, + "tokens_seen": 1581002752 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002631394182547643, + "loss": 2.7979, + "theoretical_loss": 3.499895443496575, + "tokens_seen": 1581002752 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002631293881644935, + "loss": 2.7287, + "theoretical_loss": 3.499882781057105, + "tokens_seen": 1581068288 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026311935807422264, + "loss": 2.6902, + "theoretical_loss": 3.4998701192894437, + "tokens_seen": 1581133824 + }, + { + "epoch": 4.06, + "learning_rate": 0.0002631093279839519, + "loss": 2.8311, + "theoretical_loss": 3.4998574581935262, + "tokens_seen": 1581199360 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026309929789368106, + "loss": 2.898, + "theoretical_loss": 3.4998447977692897, + "tokens_seen": 1581264896 + }, + { + "epoch": 4.06, + "learning_rate": 0.00026308926780341024, + "loss": 2.8918, + "theoretical_loss": 3.4998321380166706, + "tokens_seen": 1581330432 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002630792377131394, + "loss": 2.8154, + "theoretical_loss": 3.4998194789356063, + "tokens_seen": 1581395968 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002630692076228686, + "loss": 2.8666, + "theoretical_loss": 3.4998068205260315, + "tokens_seen": 1581461504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002630591775325978, + "loss": 2.8677, + "theoretical_loss": 3.499794162787885, + "tokens_seen": 1581527040 + }, + { + "epoch": 4.07, + "learning_rate": 0.000263049147442327, + "loss": 2.8812, + "theoretical_loss": 3.4997815057211015, + "tokens_seen": 1581592576 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026303911735205615, + "loss": 2.6761, + "theoretical_loss": 3.4997688493256183, + "tokens_seen": 1581658112 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002630290872617854, + "loss": 2.7152, + "theoretical_loss": 3.499756193601372, + "tokens_seen": 1581723648 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002630190571715145, + "loss": 2.7239, + "theoretical_loss": 3.4997435385483, + "tokens_seen": 1581789184 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026300902708124374, + "loss": 2.8386, + "theoretical_loss": 3.499730884166337, + "tokens_seen": 1581854720 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002629989969909729, + "loss": 2.9386, + "theoretical_loss": 3.4997182304554206, + "tokens_seen": 1581920256 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002629889669007021, + "loss": 2.827, + "theoretical_loss": 3.4997055774154884, + "tokens_seen": 1581985792 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002629789368104313, + "loss": 2.6643, + "theoretical_loss": 3.4996929250464754, + "tokens_seen": 1582051328 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026296890672016047, + "loss": 2.8889, + "theoretical_loss": 3.499680273348319, + "tokens_seen": 1582116864 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026295887662988965, + "loss": 2.9362, + "theoretical_loss": 3.499667622320956, + "tokens_seen": 1582182400 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002629488465396189, + "loss": 2.7408, + "theoretical_loss": 3.499654971964323, + "tokens_seen": 1582247936 + }, + { + "epoch": 4.07, + "learning_rate": 0.000262938816449348, + "loss": 2.7963, + "theoretical_loss": 3.4996423222783566, + "tokens_seen": 1582313472 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026292878635907725, + "loss": 2.9582, + "theoretical_loss": 3.4996296732629926, + "tokens_seen": 1582379008 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026291875626880643, + "loss": 2.7513, + "theoretical_loss": 3.4996170249181695, + "tokens_seen": 1582444544 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002629087261785356, + "loss": 2.8435, + "theoretical_loss": 3.499604377243822, + "tokens_seen": 1582510080 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002628986960882648, + "loss": 2.8472, + "theoretical_loss": 3.4995917302398882, + "tokens_seen": 1582575616 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2525316, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0229058265686035, + "objective/train/theoretical_loss": 3.499579083906305, + "objective/train/tokens_used": 1603101152, + "theoretical_loss": 3.499579083906305, + "tokens_seen": 1582641152 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026288866599799397, + "loss": 2.8762, + "theoretical_loss": 3.499579083906305, + "tokens_seen": 1582641152 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026287863590772315, + "loss": 2.9313, + "theoretical_loss": 3.499566438243007, + "tokens_seen": 1582706688 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002628686058174524, + "loss": 2.7795, + "theoretical_loss": 3.4995537932499334, + "tokens_seen": 1582772224 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002628585757271815, + "loss": 2.5837, + "theoretical_loss": 3.49954114892702, + "tokens_seen": 1582837760 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026284854563691075, + "loss": 2.9737, + "theoretical_loss": 3.4995285052742027, + "tokens_seen": 1582903296 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002628385155466399, + "loss": 2.953, + "theoretical_loss": 3.4995158622914193, + "tokens_seen": 1582968832 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002628284854563691, + "loss": 2.7596, + "theoretical_loss": 3.4995032199786062, + "tokens_seen": 1583034368 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002628184553660983, + "loss": 2.7388, + "theoretical_loss": 3.4994905783357004, + "tokens_seen": 1583099904 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002628084252758275, + "loss": 2.6681, + "theoretical_loss": 3.4994779373626383, + "tokens_seen": 1583165440 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026279839518555666, + "loss": 2.8248, + "theoretical_loss": 3.499465297059357, + "tokens_seen": 1583230976 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002627883650952859, + "loss": 2.8535, + "theoretical_loss": 3.4994526574257927, + "tokens_seen": 1583296512 + }, + { + "epoch": 4.07, + "learning_rate": 0.000262778335005015, + "loss": 2.9321, + "theoretical_loss": 3.4994400184618835, + "tokens_seen": 1583362048 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026276830491474425, + "loss": 2.6967, + "theoretical_loss": 3.499427380167565, + "tokens_seen": 1583427584 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026275827482447343, + "loss": 2.6967, + "theoretical_loss": 3.4994147425427737, + "tokens_seen": 1583493120 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002627482447342026, + "loss": 2.72, + "theoretical_loss": 3.499402105587448, + "tokens_seen": 1583558656 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026273821464393185, + "loss": 2.9168, + "theoretical_loss": 3.499389469301523, + "tokens_seen": 1583624192 + }, + { + "epoch": 4.07, + "learning_rate": 0.000262728184553661, + "loss": 2.7737, + "theoretical_loss": 3.499376833684937, + "tokens_seen": 1583689728 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002627181544633902, + "loss": 2.6628, + "theoretical_loss": 3.499364198737626, + "tokens_seen": 1583755264 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026270812437311934, + "loss": 2.8132, + "theoretical_loss": 3.499351564459527, + "tokens_seen": 1583820800 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002626980942828486, + "loss": 2.845, + "theoretical_loss": 3.499338930850577, + "tokens_seen": 1583886336 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026268806419257776, + "loss": 2.7814, + "theoretical_loss": 3.499326297910713, + "tokens_seen": 1583951872 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026267803410230694, + "loss": 2.7994, + "theoretical_loss": 3.499313665639871, + "tokens_seen": 1584017408 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002626680040120361, + "loss": 2.7557, + "theoretical_loss": 3.4993010340379893, + "tokens_seen": 1584082944 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002626579739217653, + "loss": 2.7493, + "theoretical_loss": 3.4992884031050036, + "tokens_seen": 1584148480 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002626479438314945, + "loss": 2.9729, + "theoretical_loss": 3.4992757728408517, + "tokens_seen": 1584214016 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2528122, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.981548309326172, + "objective/train/theoretical_loss": 3.4992631432454697, + "objective/train/tokens_used": 1604739552, + "theoretical_loss": 3.4992631432454697, + "tokens_seen": 1584279552 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002626379137412237, + "loss": 2.8185, + "theoretical_loss": 3.4992631432454697, + "tokens_seen": 1584279552 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026262788365095284, + "loss": 2.9673, + "theoretical_loss": 3.4992505143187955, + "tokens_seen": 1584345088 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002626178535606821, + "loss": 2.8557, + "theoretical_loss": 3.499237886060765, + "tokens_seen": 1584410624 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026260782347041126, + "loss": 2.7248, + "theoretical_loss": 3.499225258471316, + "tokens_seen": 1584476160 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026259779338014044, + "loss": 2.7944, + "theoretical_loss": 3.4992126315503844, + "tokens_seen": 1584541696 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002625877632898696, + "loss": 2.8818, + "theoretical_loss": 3.499200005297909, + "tokens_seen": 1584607232 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002625777331995988, + "loss": 2.8499, + "theoretical_loss": 3.4991873797138244, + "tokens_seen": 1584672768 + }, + { + "epoch": 4.07, + "learning_rate": 0.000262567703109328, + "loss": 2.7129, + "theoretical_loss": 3.4991747547980694, + "tokens_seen": 1584738304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002625576730190572, + "loss": 2.7323, + "theoretical_loss": 3.4991621305505802, + "tokens_seen": 1584803840 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026254764292878635, + "loss": 2.733, + "theoretical_loss": 3.499149506971294, + "tokens_seen": 1584869376 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002625376128385156, + "loss": 2.7367, + "theoretical_loss": 3.499136884060148, + "tokens_seen": 1584934912 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002625275827482447, + "loss": 2.6826, + "theoretical_loss": 3.4991242618170784, + "tokens_seen": 1585000448 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026251755265797394, + "loss": 2.8677, + "theoretical_loss": 3.499111640242023, + "tokens_seen": 1585065984 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002625075225677031, + "loss": 2.7882, + "theoretical_loss": 3.4990990193349187, + "tokens_seen": 1585131520 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002624974924774323, + "loss": 2.7902, + "theoretical_loss": 3.4990863990957024, + "tokens_seen": 1585197056 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002624874623871615, + "loss": 2.6673, + "theoretical_loss": 3.499073779524311, + "tokens_seen": 1585262592 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026247743229689067, + "loss": 2.7599, + "theoretical_loss": 3.4990611606206823, + "tokens_seen": 1585328128 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026246740220661985, + "loss": 2.6694, + "theoretical_loss": 3.4990485423847524, + "tokens_seen": 1585393664 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002624573721163491, + "loss": 2.8013, + "theoretical_loss": 3.499035924816459, + "tokens_seen": 1585459200 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002624473420260782, + "loss": 2.9166, + "theoretical_loss": 3.499023307915739, + "tokens_seen": 1585524736 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026243731193580745, + "loss": 2.6578, + "theoretical_loss": 3.499010691682529, + "tokens_seen": 1585590272 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026242728184553663, + "loss": 2.8011, + "theoretical_loss": 3.498998076116767, + "tokens_seen": 1585655808 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002624172517552658, + "loss": 2.7919, + "theoretical_loss": 3.49898546121839, + "tokens_seen": 1585721344 + }, + { + "epoch": 4.07, + "learning_rate": 0.000262407221664995, + "loss": 2.8977, + "theoretical_loss": 3.498972846987334, + "tokens_seen": 1585786880 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026239719157472417, + "loss": 2.8793, + "theoretical_loss": 3.4989602334235372, + "tokens_seen": 1585852416 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2530888, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8350446224212646, + "objective/train/theoretical_loss": 3.4989476205269368, + "objective/train/tokens_used": 1606377952, + "theoretical_loss": 3.4989476205269368, + "tokens_seen": 1585917952 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026238716148445335, + "loss": 2.7909, + "theoretical_loss": 3.4989476205269368, + "tokens_seen": 1585917952 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002623771313941826, + "loss": 2.8537, + "theoretical_loss": 3.498935008297469, + "tokens_seen": 1585983488 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002623671013039117, + "loss": 2.8225, + "theoretical_loss": 3.4989223967350718, + "tokens_seen": 1586049024 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026235707121364095, + "loss": 2.8201, + "theoretical_loss": 3.4989097858396816, + "tokens_seen": 1586114560 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002623470411233701, + "loss": 2.7745, + "theoretical_loss": 3.498897175611237, + "tokens_seen": 1586180096 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002623370110330993, + "loss": 2.8636, + "theoretical_loss": 3.4988845660496732, + "tokens_seen": 1586245632 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002623269809428285, + "loss": 2.8332, + "theoretical_loss": 3.498871957154929, + "tokens_seen": 1586311168 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002623169508525577, + "loss": 2.8359, + "theoretical_loss": 3.498859348926941, + "tokens_seen": 1586376704 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026230692076228686, + "loss": 2.6791, + "theoretical_loss": 3.4988467413656466, + "tokens_seen": 1586442240 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002622968906720161, + "loss": 2.8131, + "theoretical_loss": 3.4988341344709823, + "tokens_seen": 1586507776 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002622868605817452, + "loss": 2.6866, + "theoretical_loss": 3.498821528242886, + "tokens_seen": 1586573312 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026227683049147445, + "loss": 2.7963, + "theoretical_loss": 3.4988089226812944, + "tokens_seen": 1586638848 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002622668004012036, + "loss": 2.6026, + "theoretical_loss": 3.4987963177861454, + "tokens_seen": 1586704384 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002622567703109328, + "loss": 2.8938, + "theoretical_loss": 3.4987837135573763, + "tokens_seen": 1586769920 + }, + { + "epoch": 4.07, + "learning_rate": 0.000262246740220662, + "loss": 2.9248, + "theoretical_loss": 3.498771109994923, + "tokens_seen": 1586835456 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002622367101303912, + "loss": 2.8715, + "theoretical_loss": 3.4987585070987244, + "tokens_seen": 1586900992 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026222668004012036, + "loss": 2.8087, + "theoretical_loss": 3.4987459048687173, + "tokens_seen": 1586966528 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026221664994984954, + "loss": 2.7894, + "theoretical_loss": 3.4987333033048382, + "tokens_seen": 1587032064 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002622066198595787, + "loss": 2.9342, + "theoretical_loss": 3.4987207024070255, + "tokens_seen": 1587097600 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026219658976930796, + "loss": 2.735, + "theoretical_loss": 3.4987081021752156, + "tokens_seen": 1587163136 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002621865596790371, + "loss": 2.6882, + "theoretical_loss": 3.498695502609346, + "tokens_seen": 1587228672 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002621765295887663, + "loss": 2.8659, + "theoretical_loss": 3.498682903709354, + "tokens_seen": 1587294208 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026216649949849545, + "loss": 2.6203, + "theoretical_loss": 3.4986703054751778, + "tokens_seen": 1587359744 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002621564694082247, + "loss": 2.8891, + "theoretical_loss": 3.4986577079067533, + "tokens_seen": 1587425280 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026214643931795386, + "loss": 2.683, + "theoretical_loss": 3.498645111004019, + "tokens_seen": 1587490816 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2532371, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9819259643554688, + "objective/train/theoretical_loss": 3.4986325147669115, + "objective/train/tokens_used": 1608016352, + "theoretical_loss": 3.4986325147669115, + "tokens_seen": 1587556352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026213640922768304, + "loss": 2.7512, + "theoretical_loss": 3.4986325147669115, + "tokens_seen": 1587556352 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002621263791374122, + "loss": 2.8334, + "theoretical_loss": 3.4986199191953684, + "tokens_seen": 1587621888 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026211634904714146, + "loss": 2.9837, + "theoretical_loss": 3.4986073242893276, + "tokens_seen": 1587687424 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002621063189568706, + "loss": 2.6931, + "theoretical_loss": 3.4985947300487252, + "tokens_seen": 1587752960 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002620962888665998, + "loss": 2.853, + "theoretical_loss": 3.4985821364735, + "tokens_seen": 1587818496 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026208625877632895, + "loss": 2.8107, + "theoretical_loss": 3.4985695435635886, + "tokens_seen": 1587884032 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002620762286860582, + "loss": 2.8931, + "theoretical_loss": 3.4985569513189283, + "tokens_seen": 1587949568 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026206619859578737, + "loss": 2.8968, + "theoretical_loss": 3.4985443597394568, + "tokens_seen": 1588015104 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026205616850551655, + "loss": 2.6996, + "theoretical_loss": 3.4985317688251114, + "tokens_seen": 1588080640 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026204613841524573, + "loss": 2.6728, + "theoretical_loss": 3.4985191785758296, + "tokens_seen": 1588146176 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002620361083249749, + "loss": 2.7861, + "theoretical_loss": 3.4985065889915488, + "tokens_seen": 1588211712 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002620260782347041, + "loss": 2.8939, + "theoretical_loss": 3.4984940000722062, + "tokens_seen": 1588277248 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002620160481444333, + "loss": 2.7011, + "theoretical_loss": 3.49848141181774, + "tokens_seen": 1588342784 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002620060180541625, + "loss": 2.6965, + "theoretical_loss": 3.4984688242280866, + "tokens_seen": 1588408320 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002619959879638917, + "loss": 2.7259, + "theoretical_loss": 3.498456237303184, + "tokens_seen": 1588473856 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026198595787362087, + "loss": 2.8411, + "theoretical_loss": 3.4984436510429697, + "tokens_seen": 1588539392 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026197592778335005, + "loss": 2.8411, + "theoretical_loss": 3.498431065447381, + "tokens_seen": 1588604928 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002619658976930793, + "loss": 2.8254, + "theoretical_loss": 3.4984184805163556, + "tokens_seen": 1588670464 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002619558676028084, + "loss": 2.811, + "theoretical_loss": 3.4984058962498317, + "tokens_seen": 1588736000 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026194583751253765, + "loss": 2.777, + "theoretical_loss": 3.498393312647745, + "tokens_seen": 1588801536 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026193580742226683, + "loss": 2.8285, + "theoretical_loss": 3.4983807297100347, + "tokens_seen": 1588867072 + }, + { + "epoch": 4.07, + "learning_rate": 0.000261925777331996, + "loss": 2.7699, + "theoretical_loss": 3.4983681474366373, + "tokens_seen": 1588932608 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002619157472417252, + "loss": 2.9317, + "theoretical_loss": 3.4983555658274907, + "tokens_seen": 1588998144 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026190571715145437, + "loss": 2.7468, + "theoretical_loss": 3.498342984882532, + "tokens_seen": 1589063680 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026189568706118355, + "loss": 2.7318, + "theoretical_loss": 3.4983304046017, + "tokens_seen": 1589129216 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2535204, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9600729942321777, + "objective/train/theoretical_loss": 3.498317824984931, + "objective/train/tokens_used": 1609654752, + "theoretical_loss": 3.498317824984931, + "tokens_seen": 1589194752 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002618856569709128, + "loss": 2.7896, + "theoretical_loss": 3.498317824984931, + "tokens_seen": 1589194752 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002618756268806419, + "loss": 2.6826, + "theoretical_loss": 3.498305246032163, + "tokens_seen": 1589260288 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026186559679037115, + "loss": 2.7643, + "theoretical_loss": 3.4982926677433337, + "tokens_seen": 1589325824 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002618555667001003, + "loss": 2.723, + "theoretical_loss": 3.4982800901183806, + "tokens_seen": 1589391360 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002618455366098295, + "loss": 2.8667, + "theoretical_loss": 3.498267513157241, + "tokens_seen": 1589456896 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002618355065195587, + "loss": 2.7268, + "theoretical_loss": 3.498254936859853, + "tokens_seen": 1589522432 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002618254764292879, + "loss": 2.6366, + "theoretical_loss": 3.4982423612261537, + "tokens_seen": 1589587968 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026181544633901706, + "loss": 2.7428, + "theoretical_loss": 3.498229786256081, + "tokens_seen": 1589653504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002618054162487463, + "loss": 2.8178, + "theoretical_loss": 3.4982172119495725, + "tokens_seen": 1589719040 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002617953861584754, + "loss": 2.792, + "theoretical_loss": 3.4982046383065657, + "tokens_seen": 1589784576 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026178535606820465, + "loss": 2.6685, + "theoretical_loss": 3.4981920653269984, + "tokens_seen": 1589850112 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002617753259779338, + "loss": 2.778, + "theoretical_loss": 3.4981794930108085, + "tokens_seen": 1589915648 + }, + { + "epoch": 4.07, + "learning_rate": 0.000261765295887663, + "loss": 2.9853, + "theoretical_loss": 3.4981669213579325, + "tokens_seen": 1589981184 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002617552657973922, + "loss": 2.8276, + "theoretical_loss": 3.49815435036831, + "tokens_seen": 1590046720 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002617452357071214, + "loss": 2.8339, + "theoretical_loss": 3.498141780041877, + "tokens_seen": 1590112256 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026173520561685056, + "loss": 2.6578, + "theoretical_loss": 3.498129210378572, + "tokens_seen": 1590177792 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026172517552657974, + "loss": 2.8457, + "theoretical_loss": 3.4981166413783322, + "tokens_seen": 1590243328 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002617151454363089, + "loss": 2.7875, + "theoretical_loss": 3.4981040730410955, + "tokens_seen": 1590308864 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026170511534603816, + "loss": 2.7728, + "theoretical_loss": 3.4980915053667996, + "tokens_seen": 1590374400 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002616950852557673, + "loss": 2.8829, + "theoretical_loss": 3.4980789383553827, + "tokens_seen": 1590439936 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002616850551654965, + "loss": 2.8924, + "theoretical_loss": 3.4980663720067824, + "tokens_seen": 1590505472 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026167502507522565, + "loss": 2.7824, + "theoretical_loss": 3.4980538063209354, + "tokens_seen": 1590571008 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002616649949849549, + "loss": 2.7706, + "theoretical_loss": 3.4980412412977806, + "tokens_seen": 1590636544 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026165496489468406, + "loss": 2.8863, + "theoretical_loss": 3.498028676937255, + "tokens_seen": 1590702080 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026164493480441324, + "loss": 2.6568, + "theoretical_loss": 3.498016113239297, + "tokens_seen": 1590767616 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2537594, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0076663494110107, + "objective/train/theoretical_loss": 3.4980035502038436, + "objective/train/tokens_used": 1611293152, + "theoretical_loss": 3.4980035502038436, + "tokens_seen": 1590833152 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002616349047141424, + "loss": 2.84, + "theoretical_loss": 3.4980035502038436, + "tokens_seen": 1590833152 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026162487462387166, + "loss": 2.8916, + "theoretical_loss": 3.4979909878308333, + "tokens_seen": 1590898688 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002616148445336008, + "loss": 2.8142, + "theoretical_loss": 3.497978426120204, + "tokens_seen": 1590964224 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026160481444333, + "loss": 2.8955, + "theoretical_loss": 3.4979658650718926, + "tokens_seen": 1591029760 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026159478435305915, + "loss": 2.9072, + "theoretical_loss": 3.4979533046858373, + "tokens_seen": 1591095296 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002615847542627884, + "loss": 2.6315, + "theoretical_loss": 3.4979407449619764, + "tokens_seen": 1591160832 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026157472417251757, + "loss": 2.6572, + "theoretical_loss": 3.497928185900247, + "tokens_seen": 1591226368 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026156469408224675, + "loss": 2.7623, + "theoretical_loss": 3.497915627500588, + "tokens_seen": 1591291904 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026155466399197593, + "loss": 2.717, + "theoretical_loss": 3.497903069762936, + "tokens_seen": 1591357440 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002615446339017051, + "loss": 2.6292, + "theoretical_loss": 3.4978905126872286, + "tokens_seen": 1591422976 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002615346038114343, + "loss": 2.6808, + "theoretical_loss": 3.4978779562734053, + "tokens_seen": 1591488512 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002615245737211635, + "loss": 2.7921, + "theoretical_loss": 3.4978654005214027, + "tokens_seen": 1591554048 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026151454363089265, + "loss": 2.7432, + "theoretical_loss": 3.4978528454311593, + "tokens_seen": 1591619584 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002615045135406219, + "loss": 2.8878, + "theoretical_loss": 3.4978402910026123, + "tokens_seen": 1591685120 + }, + { + "epoch": 4.07, + "learning_rate": 0.000261494483450351, + "loss": 2.9099, + "theoretical_loss": 3.4978277372357, + "tokens_seen": 1591750656 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026148445336008025, + "loss": 2.8856, + "theoretical_loss": 3.497815184130361, + "tokens_seen": 1591816192 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026147442326980943, + "loss": 2.8258, + "theoretical_loss": 3.4978026316865316, + "tokens_seen": 1591881728 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002614643931795386, + "loss": 2.8651, + "theoretical_loss": 3.497790079904151, + "tokens_seen": 1591947264 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002614543630892678, + "loss": 2.8295, + "theoretical_loss": 3.497777528783156, + "tokens_seen": 1592012800 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026144433299899703, + "loss": 2.6007, + "theoretical_loss": 3.497764978323486, + "tokens_seen": 1592078336 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026143430290872616, + "loss": 2.8525, + "theoretical_loss": 3.497752428525078, + "tokens_seen": 1592143872 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002614242728184554, + "loss": 2.7941, + "theoretical_loss": 3.4977398793878693, + "tokens_seen": 1592209408 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002614142427281845, + "loss": 2.6242, + "theoretical_loss": 3.4977273309117995, + "tokens_seen": 1592274944 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026140421263791375, + "loss": 2.8988, + "theoretical_loss": 3.4977147830968054, + "tokens_seen": 1592340480 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026139418254764293, + "loss": 2.7066, + "theoretical_loss": 3.4977022359428256, + "tokens_seen": 1592406016 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2540556, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.898395299911499, + "objective/train/theoretical_loss": 3.497689689449797, + "objective/train/tokens_used": 1612931552, + "theoretical_loss": 3.497689689449797, + "tokens_seen": 1592471552 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002613841524573721, + "loss": 2.8778, + "theoretical_loss": 3.497689689449797, + "tokens_seen": 1592471552 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002613741223671013, + "loss": 2.9203, + "theoretical_loss": 3.497677143617659, + "tokens_seen": 1592537088 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002613640922768305, + "loss": 2.7375, + "theoretical_loss": 3.497664598446349, + "tokens_seen": 1592602624 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026135406218655966, + "loss": 2.7495, + "theoretical_loss": 3.4976520539358043, + "tokens_seen": 1592668160 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002613440320962889, + "loss": 2.805, + "theoretical_loss": 3.4976395100859636, + "tokens_seen": 1592733696 + }, + { + "epoch": 4.07, + "learning_rate": 0.000261334002006018, + "loss": 2.8828, + "theoretical_loss": 3.497626966896765, + "tokens_seen": 1592799232 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026132397191574726, + "loss": 2.7407, + "theoretical_loss": 3.4976144243681464, + "tokens_seen": 1592864768 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002613139418254764, + "loss": 2.8673, + "theoretical_loss": 3.497601882500046, + "tokens_seen": 1592930304 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002613039117352056, + "loss": 2.781, + "theoretical_loss": 3.4975893412924015, + "tokens_seen": 1592995840 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002612938816449348, + "loss": 2.7121, + "theoretical_loss": 3.497576800745151, + "tokens_seen": 1593061376 + }, + { + "epoch": 4.07, + "learning_rate": 0.000261283851554664, + "loss": 2.8516, + "theoretical_loss": 3.4975642608582325, + "tokens_seen": 1593126912 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026127382146439316, + "loss": 2.8176, + "theoretical_loss": 3.497551721631585, + "tokens_seen": 1593192448 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002612637913741224, + "loss": 2.8807, + "theoretical_loss": 3.497539183065145, + "tokens_seen": 1593257984 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002612537612838516, + "loss": 2.6396, + "theoretical_loss": 3.4975266451588514, + "tokens_seen": 1593323520 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026124373119358076, + "loss": 2.8304, + "theoretical_loss": 3.4975141079126426, + "tokens_seen": 1593389056 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026123370110330994, + "loss": 2.8379, + "theoretical_loss": 3.497501571326456, + "tokens_seen": 1593454592 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002612236710130391, + "loss": 2.6807, + "theoretical_loss": 3.497489035400231, + "tokens_seen": 1593520128 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026121364092276836, + "loss": 2.7213, + "theoretical_loss": 3.497476500133904, + "tokens_seen": 1593585664 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002612036108324975, + "loss": 2.95, + "theoretical_loss": 3.4974639655274142, + "tokens_seen": 1593651200 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002611935807422267, + "loss": 2.8264, + "theoretical_loss": 3.497451431580699, + "tokens_seen": 1593716736 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026118355065195585, + "loss": 2.7196, + "theoretical_loss": 3.497438898293698, + "tokens_seen": 1593782272 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002611735205616851, + "loss": 2.9233, + "theoretical_loss": 3.4974263656663473, + "tokens_seen": 1593847808 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026116349047141426, + "loss": 2.7484, + "theoretical_loss": 3.4974138336985865, + "tokens_seen": 1593913344 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026115346038114344, + "loss": 2.8371, + "theoretical_loss": 3.4974013023903536, + "tokens_seen": 1593978880 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002611434302908726, + "loss": 2.735, + "theoretical_loss": 3.4973887717415866, + "tokens_seen": 1594044416 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2543356, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.74835467338562, + "objective/train/theoretical_loss": 3.4973762417522236, + "objective/train/tokens_used": 1614569952, + "theoretical_loss": 3.4973762417522236, + "tokens_seen": 1594109952 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026113340020060186, + "loss": 2.794, + "theoretical_loss": 3.4973762417522236, + "tokens_seen": 1594109952 + }, + { + "epoch": 4.07, + "learning_rate": 0.000261123370110331, + "loss": 2.7434, + "theoretical_loss": 3.4973637124222026, + "tokens_seen": 1594175488 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002611133400200602, + "loss": 2.769, + "theoretical_loss": 3.497351183751462, + "tokens_seen": 1594241024 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026110330992978935, + "loss": 2.6808, + "theoretical_loss": 3.4973386557399406, + "tokens_seen": 1594306560 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002610932798395186, + "loss": 2.8195, + "theoretical_loss": 3.4973261283875754, + "tokens_seen": 1594372096 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026108324974924777, + "loss": 2.888, + "theoretical_loss": 3.497313601694306, + "tokens_seen": 1594437632 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026107321965897695, + "loss": 2.7384, + "theoretical_loss": 3.4973010756600695, + "tokens_seen": 1594503168 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026106318956870613, + "loss": 2.8041, + "theoretical_loss": 3.4972885502848046, + "tokens_seen": 1594568704 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002610531594784353, + "loss": 2.7831, + "theoretical_loss": 3.49727602556845, + "tokens_seen": 1594634240 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002610431293881645, + "loss": 2.682, + "theoretical_loss": 3.4972635015109423, + "tokens_seen": 1594699776 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002610330992978937, + "loss": 2.7419, + "theoretical_loss": 3.497250978112222, + "tokens_seen": 1594765312 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026102306920762285, + "loss": 2.8696, + "theoretical_loss": 3.497238455372226, + "tokens_seen": 1594830848 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002610130391173521, + "loss": 2.9016, + "theoretical_loss": 3.497225933290893, + "tokens_seen": 1594896384 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002610030090270812, + "loss": 2.8932, + "theoretical_loss": 3.4972134118681613, + "tokens_seen": 1594961920 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026099297893681045, + "loss": 2.744, + "theoretical_loss": 3.4972008911039687, + "tokens_seen": 1595027456 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026098294884653963, + "loss": 2.9434, + "theoretical_loss": 3.4971883709982543, + "tokens_seen": 1595092992 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002609729187562688, + "loss": 2.9945, + "theoretical_loss": 3.4971758515509563, + "tokens_seen": 1595158528 + }, + { + "epoch": 4.07, + "learning_rate": 0.000260962888665998, + "loss": 2.791, + "theoretical_loss": 3.497163332762012, + "tokens_seen": 1595224064 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026095285857572723, + "loss": 2.7449, + "theoretical_loss": 3.497150814631361, + "tokens_seen": 1595289600 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026094282848545636, + "loss": 2.6425, + "theoretical_loss": 3.4971382971589415, + "tokens_seen": 1595355136 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002609327983951856, + "loss": 2.6982, + "theoretical_loss": 3.4971257803446907, + "tokens_seen": 1595420672 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002609227683049147, + "loss": 2.8601, + "theoretical_loss": 3.497113264188548, + "tokens_seen": 1595486208 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026091273821464395, + "loss": 2.7097, + "theoretical_loss": 3.497100748690452, + "tokens_seen": 1595551744 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026090270812437313, + "loss": 2.737, + "theoretical_loss": 3.49708823385034, + "tokens_seen": 1595617280 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002608926780341023, + "loss": 2.8657, + "theoretical_loss": 3.4970757196681515, + "tokens_seen": 1595682816 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2546171, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.634852170944214, + "objective/train/theoretical_loss": 3.497063206143824, + "objective/train/tokens_used": 1616208352, + "theoretical_loss": 3.497063206143824, + "tokens_seen": 1595748352 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002608826479438315, + "loss": 2.8449, + "theoretical_loss": 3.497063206143824, + "tokens_seen": 1595748352 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002608726178535607, + "loss": 2.9195, + "theoretical_loss": 3.4970506932772967, + "tokens_seen": 1595813888 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026086258776328986, + "loss": 2.9636, + "theoretical_loss": 3.4970381810685067, + "tokens_seen": 1595879424 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002608525576730191, + "loss": 2.641, + "theoretical_loss": 3.4970256695173942, + "tokens_seen": 1595944960 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002608425275827482, + "loss": 2.8509, + "theoretical_loss": 3.4970131586238966, + "tokens_seen": 1596010496 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026083249749247746, + "loss": 2.7327, + "theoretical_loss": 3.497000648387952, + "tokens_seen": 1596076032 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002608224674022066, + "loss": 2.7486, + "theoretical_loss": 3.4969881388095, + "tokens_seen": 1596141568 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002608124373119358, + "loss": 2.6129, + "theoretical_loss": 3.4969756298884778, + "tokens_seen": 1596207104 + }, + { + "epoch": 4.07, + "learning_rate": 0.000260802407221665, + "loss": 2.6883, + "theoretical_loss": 3.4969631216248245, + "tokens_seen": 1596272640 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002607923771313942, + "loss": 2.8413, + "theoretical_loss": 3.4969506140184787, + "tokens_seen": 1596338176 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026078234704112336, + "loss": 2.6846, + "theoretical_loss": 3.4969381070693784, + "tokens_seen": 1596403712 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002607723169508526, + "loss": 2.8923, + "theoretical_loss": 3.4969256007774625, + "tokens_seen": 1596469248 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002607622868605817, + "loss": 2.6337, + "theoretical_loss": 3.496913095142669, + "tokens_seen": 1596534784 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026075225677031096, + "loss": 2.9511, + "theoretical_loss": 3.4969005901649375, + "tokens_seen": 1596600320 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002607422266800401, + "loss": 2.7814, + "theoretical_loss": 3.4968880858442053, + "tokens_seen": 1596665856 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002607321965897693, + "loss": 2.7544, + "theoretical_loss": 3.4968755821804116, + "tokens_seen": 1596731392 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002607221664994985, + "loss": 2.7411, + "theoretical_loss": 3.4968630791734943, + "tokens_seen": 1596796928 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002607121364092277, + "loss": 2.6532, + "theoretical_loss": 3.496850576823392, + "tokens_seen": 1596862464 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026070210631895687, + "loss": 2.8451, + "theoretical_loss": 3.4968380751300443, + "tokens_seen": 1596928000 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026069207622868605, + "loss": 2.4348, + "theoretical_loss": 3.4968255740933887, + "tokens_seen": 1596993536 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026068204613841523, + "loss": 2.6852, + "theoretical_loss": 3.4968130737133643, + "tokens_seen": 1597059072 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026067201604814446, + "loss": 2.7837, + "theoretical_loss": 3.4968005739899093, + "tokens_seen": 1597124608 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002606619859578736, + "loss": 2.9291, + "theoretical_loss": 3.4967880749229625, + "tokens_seen": 1597190144 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002606519558676028, + "loss": 2.815, + "theoretical_loss": 3.496775576512462, + "tokens_seen": 1597255680 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026064192577733195, + "loss": 2.7117, + "theoretical_loss": 3.496763078758347, + "tokens_seen": 1597321216 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2547507, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9556474685668945, + "objective/train/theoretical_loss": 3.4967505816605557, + "objective/train/tokens_used": 1617846752, + "theoretical_loss": 3.4967505816605557, + "tokens_seen": 1597386752 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002606318956870612, + "loss": 2.9361, + "theoretical_loss": 3.4967505816605557, + "tokens_seen": 1597386752 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026062186559679037, + "loss": 2.9189, + "theoretical_loss": 3.4967380852190275, + "tokens_seen": 1597452288 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026061183550651955, + "loss": 2.8589, + "theoretical_loss": 3.4967255894337, + "tokens_seen": 1597517824 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026060180541624873, + "loss": 2.7088, + "theoretical_loss": 3.4967130943045124, + "tokens_seen": 1597583360 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026059177532597797, + "loss": 2.6845, + "theoretical_loss": 3.496700599831403, + "tokens_seen": 1597648896 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002605817452357071, + "loss": 2.8754, + "theoretical_loss": 3.496688106014311, + "tokens_seen": 1597714432 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026057171514543633, + "loss": 2.6588, + "theoretical_loss": 3.496675612853174, + "tokens_seen": 1597779968 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026056168505516546, + "loss": 2.7367, + "theoretical_loss": 3.496663120347932, + "tokens_seen": 1597845504 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002605516549648947, + "loss": 2.6231, + "theoretical_loss": 3.4966506284985224, + "tokens_seen": 1597911040 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026054162487462387, + "loss": 2.8344, + "theoretical_loss": 3.4966381373048847, + "tokens_seen": 1597976576 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026053159478435305, + "loss": 2.7378, + "theoretical_loss": 3.4966256467669576, + "tokens_seen": 1598042112 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026052156469408223, + "loss": 2.6702, + "theoretical_loss": 3.4966131568846794, + "tokens_seen": 1598107648 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002605115346038114, + "loss": 2.6951, + "theoretical_loss": 3.4966006676579884, + "tokens_seen": 1598173184 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026050150451354065, + "loss": 2.6521, + "theoretical_loss": 3.4965881790868245, + "tokens_seen": 1598238720 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026049147442326983, + "loss": 2.8647, + "theoretical_loss": 3.4965756911711257, + "tokens_seen": 1598304256 + }, + { + "epoch": 4.07, + "learning_rate": 0.000260481444332999, + "loss": 2.6589, + "theoretical_loss": 3.4965632039108305, + "tokens_seen": 1598369792 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002604714142427282, + "loss": 2.7384, + "theoretical_loss": 3.496550717305878, + "tokens_seen": 1598435328 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026046138415245743, + "loss": 2.7302, + "theoretical_loss": 3.496538231356207, + "tokens_seen": 1598500864 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026045135406218656, + "loss": 2.7559, + "theoretical_loss": 3.4965257460617556, + "tokens_seen": 1598566400 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002604413239719158, + "loss": 2.7271, + "theoretical_loss": 3.4965132614224634, + "tokens_seen": 1598631936 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002604312938816449, + "loss": 2.8665, + "theoretical_loss": 3.4965007774382686, + "tokens_seen": 1598697472 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026042126379137415, + "loss": 2.874, + "theoretical_loss": 3.4964882941091107, + "tokens_seen": 1598763008 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026041123370110333, + "loss": 2.7209, + "theoretical_loss": 3.4964758114349275, + "tokens_seen": 1598828544 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002604012036108325, + "loss": 2.8295, + "theoretical_loss": 3.4964633294156586, + "tokens_seen": 1598894080 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002603911735205617, + "loss": 2.8451, + "theoretical_loss": 3.4964508480512424, + "tokens_seen": 1598959616 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2550296, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8697376251220703, + "objective/train/theoretical_loss": 3.4964383673416175, + "objective/train/tokens_used": 1619485152, + "theoretical_loss": 3.4964383673416175, + "tokens_seen": 1599025152 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002603811434302909, + "loss": 2.8289, + "theoretical_loss": 3.4964383673416175, + "tokens_seen": 1599025152 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026037111334002006, + "loss": 2.8665, + "theoretical_loss": 3.4964258872867235, + "tokens_seen": 1599090688 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002603610832497493, + "loss": 2.7373, + "theoretical_loss": 3.4964134078864983, + "tokens_seen": 1599156224 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002603510531594784, + "loss": 2.894, + "theoretical_loss": 3.496400929140881, + "tokens_seen": 1599221760 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026034102306920766, + "loss": 2.8515, + "theoretical_loss": 3.496388451049811, + "tokens_seen": 1599287296 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002603309929789368, + "loss": 2.764, + "theoretical_loss": 3.496375973613226, + "tokens_seen": 1599352832 + }, + { + "epoch": 4.07, + "learning_rate": 0.000260320962888666, + "loss": 2.6496, + "theoretical_loss": 3.4963634968310666, + "tokens_seen": 1599418368 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002603109327983952, + "loss": 2.8042, + "theoretical_loss": 3.49635102070327, + "tokens_seen": 1599483904 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002603009027081244, + "loss": 2.8411, + "theoretical_loss": 3.4963385452297757, + "tokens_seen": 1599549440 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026029087261785356, + "loss": 2.7244, + "theoretical_loss": 3.4963260704105226, + "tokens_seen": 1599614976 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002602808425275828, + "loss": 2.9228, + "theoretical_loss": 3.4963135962454497, + "tokens_seen": 1599680512 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002602708124373119, + "loss": 2.6116, + "theoretical_loss": 3.4963011227344953, + "tokens_seen": 1599746048 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026026078234704116, + "loss": 2.7805, + "theoretical_loss": 3.4962886498775996, + "tokens_seen": 1599811584 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002602507522567703, + "loss": 2.7913, + "theoretical_loss": 3.4962761776747002, + "tokens_seen": 1599877120 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002602407221664995, + "loss": 2.9015, + "theoretical_loss": 3.4962637061257364, + "tokens_seen": 1599942656 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002602306920762287, + "loss": 2.8265, + "theoretical_loss": 3.496251235230648, + "tokens_seen": 1600008192 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002602206619859579, + "loss": 2.7572, + "theoretical_loss": 3.4962387649893722, + "tokens_seen": 1600073728 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026021063189568707, + "loss": 2.8347, + "theoretical_loss": 3.4962262954018497, + "tokens_seen": 1600139264 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026020060180541625, + "loss": 2.8963, + "theoretical_loss": 3.4962138264680185, + "tokens_seen": 1600204800 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026019057171514543, + "loss": 2.7834, + "theoretical_loss": 3.4962013581878173, + "tokens_seen": 1600270336 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026018054162487466, + "loss": 2.8803, + "theoretical_loss": 3.4961888905611858, + "tokens_seen": 1600335872 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002601705115346038, + "loss": 2.711, + "theoretical_loss": 3.4961764235880626, + "tokens_seen": 1600401408 + }, + { + "epoch": 4.07, + "learning_rate": 0.000260160481444333, + "loss": 2.9388, + "theoretical_loss": 3.496163957268387, + "tokens_seen": 1600466944 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026015045135406215, + "loss": 2.7806, + "theoretical_loss": 3.4961514916020975, + "tokens_seen": 1600532480 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002601404212637914, + "loss": 2.8083, + "theoretical_loss": 3.4961390265891334, + "tokens_seen": 1600598016 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2552663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8925702571868896, + "objective/train/theoretical_loss": 3.496126562229434, + "objective/train/tokens_used": 1621123552, + "theoretical_loss": 3.496126562229434, + "tokens_seen": 1600663552 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026013039117352057, + "loss": 2.8537, + "theoretical_loss": 3.496126562229434, + "tokens_seen": 1600663552 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026012036108324975, + "loss": 2.8865, + "theoretical_loss": 3.4961140985229378, + "tokens_seen": 1600729088 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026011033099297893, + "loss": 2.7337, + "theoretical_loss": 3.496101635469584, + "tokens_seen": 1600794624 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026010030090270817, + "loss": 2.6072, + "theoretical_loss": 3.4960891730693113, + "tokens_seen": 1600860160 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002600902708124373, + "loss": 2.6876, + "theoretical_loss": 3.4960767113220594, + "tokens_seen": 1600925696 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026008024072216653, + "loss": 2.8096, + "theoretical_loss": 3.4960642502277675, + "tokens_seen": 1600991232 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026007021063189566, + "loss": 2.6996, + "theoretical_loss": 3.4960517897863737, + "tokens_seen": 1601056768 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002600601805416249, + "loss": 2.73, + "theoretical_loss": 3.496039329997818, + "tokens_seen": 1601122304 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026005015045135407, + "loss": 2.6846, + "theoretical_loss": 3.4960268708620386, + "tokens_seen": 1601187840 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026004012036108325, + "loss": 2.7654, + "theoretical_loss": 3.496014412378975, + "tokens_seen": 1601253376 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026003009027081243, + "loss": 2.757, + "theoretical_loss": 3.496001954548567, + "tokens_seen": 1601318912 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002600200601805416, + "loss": 2.8619, + "theoretical_loss": 3.4959894973707524, + "tokens_seen": 1601384448 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002600100300902708, + "loss": 2.7777, + "theoretical_loss": 3.4959770408454713, + "tokens_seen": 1601449984 + }, + { + "epoch": 4.07, + "learning_rate": 0.00026000000000000003, + "loss": 2.7683, + "theoretical_loss": 3.495964584972662, + "tokens_seen": 1601515520 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025998996990972916, + "loss": 2.8738, + "theoretical_loss": 3.495952129752265, + "tokens_seen": 1601581056 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002599799398194584, + "loss": 2.8476, + "theoretical_loss": 3.4959396751842178, + "tokens_seen": 1601646592 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002599699097291876, + "loss": 2.9043, + "theoretical_loss": 3.4959272212684604, + "tokens_seen": 1601712128 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025995987963891676, + "loss": 2.8035, + "theoretical_loss": 3.495914768004932, + "tokens_seen": 1601777664 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025994984954864594, + "loss": 2.7163, + "theoretical_loss": 3.4959023153935718, + "tokens_seen": 1601843200 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002599398194583751, + "loss": 2.6842, + "theoretical_loss": 3.4958898634343183, + "tokens_seen": 1601908736 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002599297893681043, + "loss": 2.6917, + "theoretical_loss": 3.4958774121271112, + "tokens_seen": 1601974272 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025991975927783353, + "loss": 2.8045, + "theoretical_loss": 3.4958649614718897, + "tokens_seen": 1602039808 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025990972918756266, + "loss": 2.8033, + "theoretical_loss": 3.495852511468593, + "tokens_seen": 1602105344 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002598996990972919, + "loss": 2.854, + "theoretical_loss": 3.4958400621171597, + "tokens_seen": 1602170880 + }, + { + "epoch": 4.07, + "learning_rate": 0.000259889669007021, + "loss": 2.6422, + "theoretical_loss": 3.49582761341753, + "tokens_seen": 1602236416 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2555521, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.117548942565918, + "objective/train/theoretical_loss": 3.4958151653696428, + "objective/train/tokens_used": 1622761952, + "theoretical_loss": 3.4958151653696428, + "tokens_seen": 1602301952 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025987963891675026, + "loss": 2.8836, + "theoretical_loss": 3.4958151653696428, + "tokens_seen": 1602301952 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025986960882647944, + "loss": 2.8586, + "theoretical_loss": 3.4958027179734366, + "tokens_seen": 1602367488 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002598595787362086, + "loss": 2.7664, + "theoretical_loss": 3.4957902712288513, + "tokens_seen": 1602433024 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002598495486459378, + "loss": 2.7445, + "theoretical_loss": 3.495777825135826, + "tokens_seen": 1602498560 + }, + { + "epoch": 4.07, + "learning_rate": 0.000259839518555667, + "loss": 2.7953, + "theoretical_loss": 3.4957653796943005, + "tokens_seen": 1602564096 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025982948846539616, + "loss": 2.8778, + "theoretical_loss": 3.4957529349042127, + "tokens_seen": 1602629632 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002598194583751254, + "loss": 2.8766, + "theoretical_loss": 3.4957404907655034, + "tokens_seen": 1602695168 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002598094282848545, + "loss": 2.7109, + "theoretical_loss": 3.4957280472781105, + "tokens_seen": 1602760704 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025979939819458376, + "loss": 2.5749, + "theoretical_loss": 3.4957156044419744, + "tokens_seen": 1602826240 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025978936810431294, + "loss": 2.6951, + "theoretical_loss": 3.4957031622570334, + "tokens_seen": 1602891776 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002597793380140421, + "loss": 2.7216, + "theoretical_loss": 3.4956907207232284, + "tokens_seen": 1602957312 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002597693079237713, + "loss": 2.8035, + "theoretical_loss": 3.4956782798404964, + "tokens_seen": 1603022848 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002597592778335005, + "loss": 2.6655, + "theoretical_loss": 3.4956658396087787, + "tokens_seen": 1603088384 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002597492477432297, + "loss": 2.7372, + "theoretical_loss": 3.4956534000280137, + "tokens_seen": 1603153920 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002597392176529589, + "loss": 2.6821, + "theoretical_loss": 3.4956409610981405, + "tokens_seen": 1603219456 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002597291875626881, + "loss": 2.7597, + "theoretical_loss": 3.4956285228190995, + "tokens_seen": 1603284992 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025971915747241727, + "loss": 2.8043, + "theoretical_loss": 3.4956160851908287, + "tokens_seen": 1603350528 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025970912738214645, + "loss": 2.8002, + "theoretical_loss": 3.495603648213269, + "tokens_seen": 1603416064 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025969909729187563, + "loss": 2.7182, + "theoretical_loss": 3.495591211886358, + "tokens_seen": 1603481600 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025968906720160486, + "loss": 2.979, + "theoretical_loss": 3.4955787762100363, + "tokens_seen": 1603547136 + }, + { + "epoch": 4.07, + "learning_rate": 0.000259679037111334, + "loss": 2.7486, + "theoretical_loss": 3.495566341184243, + "tokens_seen": 1603612672 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002596690070210632, + "loss": 2.8405, + "theoretical_loss": 3.4955539068089174, + "tokens_seen": 1603678208 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025965897693079235, + "loss": 2.8482, + "theoretical_loss": 3.495541473083999, + "tokens_seen": 1603743744 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002596489468405216, + "loss": 2.7332, + "theoretical_loss": 3.495529040009427, + "tokens_seen": 1603809280 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025963891675025077, + "loss": 2.739, + "theoretical_loss": 3.4955166075851407, + "tokens_seen": 1603874816 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2558144, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6197457313537598, + "objective/train/theoretical_loss": 3.4955041758110803, + "objective/train/tokens_used": 1624400352, + "theoretical_loss": 3.4955041758110803, + "tokens_seen": 1603940352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025962888665997995, + "loss": 2.7755, + "theoretical_loss": 3.4955041758110803, + "tokens_seen": 1603940352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025961885656970913, + "loss": 2.7545, + "theoretical_loss": 3.4954917446871843, + "tokens_seen": 1604005888 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025960882647943837, + "loss": 2.7935, + "theoretical_loss": 3.4954793142133926, + "tokens_seen": 1604071424 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002595987963891675, + "loss": 2.8607, + "theoretical_loss": 3.4954668843896446, + "tokens_seen": 1604136960 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025958876629889673, + "loss": 2.8136, + "theoretical_loss": 3.4954544552158797, + "tokens_seen": 1604202496 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025957873620862586, + "loss": 2.7121, + "theoretical_loss": 3.495442026692037, + "tokens_seen": 1604268032 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002595687061183551, + "loss": 2.7477, + "theoretical_loss": 3.495429598818057, + "tokens_seen": 1604333568 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025955867602808427, + "loss": 2.7183, + "theoretical_loss": 3.495417171593878, + "tokens_seen": 1604399104 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025954864593781345, + "loss": 2.821, + "theoretical_loss": 3.49540474501944, + "tokens_seen": 1604464640 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025953861584754263, + "loss": 2.8231, + "theoretical_loss": 3.495392319094683, + "tokens_seen": 1604530176 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002595285857572718, + "loss": 2.8439, + "theoretical_loss": 3.4953798938195453, + "tokens_seen": 1604595712 + }, + { + "epoch": 4.07, + "learning_rate": 0.000259518555667001, + "loss": 2.6682, + "theoretical_loss": 3.4953674691939676, + "tokens_seen": 1604661248 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025950852557673023, + "loss": 2.7111, + "theoretical_loss": 3.495355045217889, + "tokens_seen": 1604726784 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025949849548645936, + "loss": 2.8154, + "theoretical_loss": 3.4953426218912487, + "tokens_seen": 1604792320 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002594884653961886, + "loss": 2.7611, + "theoretical_loss": 3.4953301992139862, + "tokens_seen": 1604857856 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002594784353059178, + "loss": 2.821, + "theoretical_loss": 3.4953177771860418, + "tokens_seen": 1604923392 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025946840521564696, + "loss": 2.8393, + "theoretical_loss": 3.4953053558073544, + "tokens_seen": 1604988928 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025945837512537614, + "loss": 2.6712, + "theoretical_loss": 3.495292935077863, + "tokens_seen": 1605054464 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002594483450351053, + "loss": 2.869, + "theoretical_loss": 3.495280514997509, + "tokens_seen": 1605120000 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002594383149448345, + "loss": 2.8035, + "theoretical_loss": 3.4952680955662303, + "tokens_seen": 1605185536 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025942828485456373, + "loss": 2.7851, + "theoretical_loss": 3.495255676783967, + "tokens_seen": 1605251072 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025941825476429286, + "loss": 2.7089, + "theoretical_loss": 3.4952432586506585, + "tokens_seen": 1605316608 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002594082246740221, + "loss": 2.7377, + "theoretical_loss": 3.4952308411662454, + "tokens_seen": 1605382144 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002593981945837512, + "loss": 2.6663, + "theoretical_loss": 3.495218424330666, + "tokens_seen": 1605447680 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025938816449348046, + "loss": 2.7429, + "theoretical_loss": 3.4952060081438603, + "tokens_seen": 1605513216 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2560974, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6399900913238525, + "objective/train/theoretical_loss": 3.4951935926057685, + "objective/train/tokens_used": 1626038752, + "theoretical_loss": 3.4951935926057685, + "tokens_seen": 1605578752 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025937813440320964, + "loss": 2.7267, + "theoretical_loss": 3.4951935926057685, + "tokens_seen": 1605578752 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002593681043129388, + "loss": 2.682, + "theoretical_loss": 3.4951811777163293, + "tokens_seen": 1605644288 + }, + { + "epoch": 4.07, + "learning_rate": 0.000259358074222668, + "loss": 2.8468, + "theoretical_loss": 3.495168763475483, + "tokens_seen": 1605709824 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002593480441323972, + "loss": 2.8627, + "theoretical_loss": 3.4951563498831693, + "tokens_seen": 1605775360 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025933801404212636, + "loss": 2.8278, + "theoretical_loss": 3.4951439369393276, + "tokens_seen": 1605840896 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002593279839518556, + "loss": 2.7493, + "theoretical_loss": 3.495131524643897, + "tokens_seen": 1605906432 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002593179538615847, + "loss": 2.8145, + "theoretical_loss": 3.4951191129968184, + "tokens_seen": 1605971968 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025930792377131396, + "loss": 2.7639, + "theoretical_loss": 3.4951067019980306, + "tokens_seen": 1606037504 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025929789368104314, + "loss": 2.795, + "theoretical_loss": 3.4950942916474736, + "tokens_seen": 1606103040 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002592878635907723, + "loss": 2.8749, + "theoretical_loss": 3.495081881945087, + "tokens_seen": 1606168576 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002592778335005015, + "loss": 2.7559, + "theoretical_loss": 3.4950694728908105, + "tokens_seen": 1606234112 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002592678034102307, + "loss": 2.7567, + "theoretical_loss": 3.495057064484584, + "tokens_seen": 1606299648 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025925777331995987, + "loss": 2.8952, + "theoretical_loss": 3.4950446567263462, + "tokens_seen": 1606365184 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002592477432296891, + "loss": 2.7728, + "theoretical_loss": 3.495032249616039, + "tokens_seen": 1606430720 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025923771313941823, + "loss": 2.7576, + "theoretical_loss": 3.4950198431536, + "tokens_seen": 1606496256 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025922768304914747, + "loss": 2.6701, + "theoretical_loss": 3.4950074373389697, + "tokens_seen": 1606561792 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002592176529588766, + "loss": 2.7916, + "theoretical_loss": 3.4949950321720884, + "tokens_seen": 1606627328 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025920762286860583, + "loss": 2.6154, + "theoretical_loss": 3.494982627652895, + "tokens_seen": 1606692864 + }, + { + "epoch": 4.07, + "learning_rate": 0.000259197592778335, + "loss": 2.7519, + "theoretical_loss": 3.4949702237813294, + "tokens_seen": 1606758400 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002591875626880642, + "loss": 2.849, + "theoretical_loss": 3.4949578205573317, + "tokens_seen": 1606823936 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025917753259779337, + "loss": 2.7797, + "theoretical_loss": 3.494945417980842, + "tokens_seen": 1606889472 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025916750250752255, + "loss": 2.8718, + "theoretical_loss": 3.4949330160517995, + "tokens_seen": 1606955008 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025915747241725173, + "loss": 2.8288, + "theoretical_loss": 3.4949206147701437, + "tokens_seen": 1607020544 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025914744232698097, + "loss": 2.8516, + "theoretical_loss": 3.4949082141358154, + "tokens_seen": 1607086080 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002591374122367101, + "loss": 2.7274, + "theoretical_loss": 3.494895814148754, + "tokens_seen": 1607151616 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2563705, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.871995449066162, + "objective/train/theoretical_loss": 3.494883414808899, + "objective/train/tokens_used": 1627677152, + "theoretical_loss": 3.494883414808899, + "tokens_seen": 1607217152 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025912738214643933, + "loss": 2.6467, + "theoretical_loss": 3.494883414808899, + "tokens_seen": 1607217152 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002591173520561685, + "loss": 2.7005, + "theoretical_loss": 3.4948710161161904, + "tokens_seen": 1607282688 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002591073219658977, + "loss": 2.7482, + "theoretical_loss": 3.494858618070568, + "tokens_seen": 1607348224 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002590972918756269, + "loss": 2.8172, + "theoretical_loss": 3.494846220671972, + "tokens_seen": 1607413760 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025908726178535606, + "loss": 2.7154, + "theoretical_loss": 3.4948338239203416, + "tokens_seen": 1607479296 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025907723169508524, + "loss": 2.874, + "theoretical_loss": 3.4948214278156176, + "tokens_seen": 1607544832 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025906720160481447, + "loss": 2.7165, + "theoretical_loss": 3.4948090323577388, + "tokens_seen": 1607610368 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002590571715145436, + "loss": 2.944, + "theoretical_loss": 3.4947966375466457, + "tokens_seen": 1607675904 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025904714142427283, + "loss": 2.616, + "theoretical_loss": 3.4947842433822784, + "tokens_seen": 1607741440 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025903711133400196, + "loss": 2.7491, + "theoretical_loss": 3.4947718498645766, + "tokens_seen": 1607806976 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002590270812437312, + "loss": 2.6894, + "theoretical_loss": 3.4947594569934797, + "tokens_seen": 1607872512 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002590170511534604, + "loss": 2.951, + "theoretical_loss": 3.4947470647689283, + "tokens_seen": 1607938048 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025900702106318956, + "loss": 2.7356, + "theoretical_loss": 3.4947346731908615, + "tokens_seen": 1608003584 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002589969909729188, + "loss": 2.8068, + "theoretical_loss": 3.49472228225922, + "tokens_seen": 1608069120 + }, + { + "epoch": 4.07, + "learning_rate": 0.000258986960882648, + "loss": 2.8175, + "theoretical_loss": 3.494709891973944, + "tokens_seen": 1608134656 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025897693079237716, + "loss": 2.6144, + "theoretical_loss": 3.4946975023349722, + "tokens_seen": 1608200192 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025896690070210634, + "loss": 2.6912, + "theoretical_loss": 3.4946851133422454, + "tokens_seen": 1608265728 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002589568706118355, + "loss": 2.6337, + "theoretical_loss": 3.494672724995704, + "tokens_seen": 1608331264 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002589468405215647, + "loss": 2.5654, + "theoretical_loss": 3.4946603372952865, + "tokens_seen": 1608396800 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025893681043129393, + "loss": 2.6425, + "theoretical_loss": 3.4946479502409344, + "tokens_seen": 1608462336 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025892678034102306, + "loss": 2.7995, + "theoretical_loss": 3.4946355638325874, + "tokens_seen": 1608527872 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002589167502507523, + "loss": 2.6397, + "theoretical_loss": 3.4946231780701846, + "tokens_seen": 1608593408 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002589067201604814, + "loss": 2.7631, + "theoretical_loss": 3.494610792953667, + "tokens_seen": 1608658944 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025889669007021066, + "loss": 2.6247, + "theoretical_loss": 3.4945984084829735, + "tokens_seen": 1608724480 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025888665997993984, + "loss": 2.8744, + "theoretical_loss": 3.4945860246580454, + "tokens_seen": 1608790016 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2564936, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5632665157318115, + "objective/train/theoretical_loss": 3.4945736414788215, + "objective/train/tokens_used": 1629315552, + "theoretical_loss": 3.4945736414788215, + "tokens_seen": 1608855552 + }, + { + "epoch": 4.07, + "learning_rate": 0.000258876629889669, + "loss": 2.7295, + "theoretical_loss": 3.4945736414788215, + "tokens_seen": 1608855552 + }, + { + "epoch": 4.07, + "learning_rate": 0.000258876629889669, + "loss": 2.8163, + "theoretical_loss": 3.494561258945243, + "tokens_seen": 1608921088 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002588665997993982, + "loss": 2.9375, + "theoretical_loss": 3.494548877057249, + "tokens_seen": 1608986624 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002588565697091274, + "loss": 2.6607, + "theoretical_loss": 3.49453649581478, + "tokens_seen": 1609052160 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025884653961885657, + "loss": 2.7533, + "theoretical_loss": 3.494524115217776, + "tokens_seen": 1609117696 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002588365095285858, + "loss": 2.9044, + "theoretical_loss": 3.494511735266177, + "tokens_seen": 1609183232 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025882647943831493, + "loss": 2.7168, + "theoretical_loss": 3.494499355959923, + "tokens_seen": 1609248768 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025881644934804416, + "loss": 2.8987, + "theoretical_loss": 3.494486977298955, + "tokens_seen": 1609314304 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025880641925777334, + "loss": 2.6461, + "theoretical_loss": 3.494474599283212, + "tokens_seen": 1609379840 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002587963891675025, + "loss": 2.8819, + "theoretical_loss": 3.4944622219126336, + "tokens_seen": 1609445376 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002587863590772317, + "loss": 2.8323, + "theoretical_loss": 3.4944498451871615, + "tokens_seen": 1609510912 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002587763289869609, + "loss": 2.7179, + "theoretical_loss": 3.4944374691067344, + "tokens_seen": 1609576448 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025876629889669007, + "loss": 2.6049, + "theoretical_loss": 3.4944250936712935, + "tokens_seen": 1609641984 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002587562688064193, + "loss": 2.7057, + "theoretical_loss": 3.494412718880778, + "tokens_seen": 1609707520 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025874623871614843, + "loss": 2.8027, + "theoretical_loss": 3.494400344735129, + "tokens_seen": 1609773056 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025874623871614843, + "loss": 2.763, + "theoretical_loss": 3.4943879712342856, + "tokens_seen": 1609838592 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025873620862587767, + "loss": 2.8459, + "theoretical_loss": 3.4943755983781886, + "tokens_seen": 1609904128 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002587261785356068, + "loss": 2.7451, + "theoretical_loss": 3.4943632261667785, + "tokens_seen": 1609969664 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025871614844533603, + "loss": 2.7097, + "theoretical_loss": 3.4943508545999946, + "tokens_seen": 1610035200 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002587061183550652, + "loss": 2.6897, + "theoretical_loss": 3.4943384836777778, + "tokens_seen": 1610100736 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002586960882647944, + "loss": 2.703, + "theoretical_loss": 3.4943261134000676, + "tokens_seen": 1610166272 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025868605817452357, + "loss": 2.8716, + "theoretical_loss": 3.4943137437668046, + "tokens_seen": 1610231808 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025867602808425275, + "loss": 2.8002, + "theoretical_loss": 3.494301374777929, + "tokens_seen": 1610297344 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025866599799398193, + "loss": 2.8338, + "theoretical_loss": 3.494289006433381, + "tokens_seen": 1610362880 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025865596790371117, + "loss": 2.947, + "theoretical_loss": 3.4942766387331003, + "tokens_seen": 1610428416 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2567938, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.803225517272949, + "objective/train/theoretical_loss": 3.4942642716770287, + "objective/train/tokens_used": 1630953952, + "theoretical_loss": 3.4942642716770287, + "tokens_seen": 1610493952 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002586459378134403, + "loss": 2.694, + "theoretical_loss": 3.4942642716770287, + "tokens_seen": 1610493952 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025863590772316953, + "loss": 2.7137, + "theoretical_loss": 3.4942519052651044, + "tokens_seen": 1610559488 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002586258776328987, + "loss": 2.7727, + "theoretical_loss": 3.494239539497269, + "tokens_seen": 1610625024 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002586158475426279, + "loss": 2.5736, + "theoretical_loss": 3.494227174373462, + "tokens_seen": 1610690560 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002586058174523571, + "loss": 2.6947, + "theoretical_loss": 3.4942148098936245, + "tokens_seen": 1610756096 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025859578736208626, + "loss": 2.7399, + "theoretical_loss": 3.494202446057696, + "tokens_seen": 1610821632 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025858575727181544, + "loss": 2.4618, + "theoretical_loss": 3.494190082865617, + "tokens_seen": 1610887168 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025857572718154467, + "loss": 2.74, + "theoretical_loss": 3.494177720317327, + "tokens_seen": 1610952704 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002585656970912738, + "loss": 2.7735, + "theoretical_loss": 3.4941653584127685, + "tokens_seen": 1611018240 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025855566700100303, + "loss": 2.8362, + "theoretical_loss": 3.49415299715188, + "tokens_seen": 1611083776 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025854563691073216, + "loss": 2.7016, + "theoretical_loss": 3.4941406365346017, + "tokens_seen": 1611149312 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002585356068204614, + "loss": 2.751, + "theoretical_loss": 3.4941282765608745, + "tokens_seen": 1611214848 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002585255767301906, + "loss": 2.7433, + "theoretical_loss": 3.4941159172306384, + "tokens_seen": 1611280384 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025851554663991976, + "loss": 2.8462, + "theoretical_loss": 3.4941035585438343, + "tokens_seen": 1611345920 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025850551654964894, + "loss": 2.8959, + "theoretical_loss": 3.494091200500402, + "tokens_seen": 1611411456 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002584954864593782, + "loss": 2.851, + "theoretical_loss": 3.494078843100282, + "tokens_seen": 1611476992 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002584854563691073, + "loss": 2.9057, + "theoretical_loss": 3.494066486343415, + "tokens_seen": 1611542528 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025847542627883654, + "loss": 2.7585, + "theoretical_loss": 3.494054130229741, + "tokens_seen": 1611608064 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025846539618856566, + "loss": 2.7031, + "theoretical_loss": 3.4940417747592, + "tokens_seen": 1611673600 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002584553660982949, + "loss": 2.7621, + "theoretical_loss": 3.494029419931733, + "tokens_seen": 1611739136 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002584453360080241, + "loss": 2.7501, + "theoretical_loss": 3.49401706574728, + "tokens_seen": 1611804672 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025843530591775326, + "loss": 2.6769, + "theoretical_loss": 3.494004712205782, + "tokens_seen": 1611870208 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025842527582748244, + "loss": 2.7978, + "theoretical_loss": 3.493992359307178, + "tokens_seen": 1611935744 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002584152457372116, + "loss": 2.8706, + "theoretical_loss": 3.4939800070514107, + "tokens_seen": 1612001280 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002584052156469408, + "loss": 2.7099, + "theoretical_loss": 3.493967655438418, + "tokens_seen": 1612066816 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2570818, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7696118354797363, + "objective/train/theoretical_loss": 3.493955304468142, + "objective/train/tokens_used": 1632592352, + "theoretical_loss": 3.493955304468142, + "tokens_seen": 1612132352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025839518555667004, + "loss": 2.8122, + "theoretical_loss": 3.493955304468142, + "tokens_seen": 1612132352 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025838515546639917, + "loss": 2.6918, + "theoretical_loss": 3.4939429541405227, + "tokens_seen": 1612197888 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002583751253761284, + "loss": 2.8354, + "theoretical_loss": 3.4939306044555, + "tokens_seen": 1612263424 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025836509528585753, + "loss": 2.7912, + "theoretical_loss": 3.493918255413015, + "tokens_seen": 1612328960 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025835506519558677, + "loss": 2.8247, + "theoretical_loss": 3.4939059070130085, + "tokens_seen": 1612394496 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025834503510531595, + "loss": 2.8969, + "theoretical_loss": 3.4938935592554197, + "tokens_seen": 1612460032 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025833500501504513, + "loss": 2.6728, + "theoretical_loss": 3.49388121214019, + "tokens_seen": 1612525568 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002583249749247743, + "loss": 2.6647, + "theoretical_loss": 3.4938688656672596, + "tokens_seen": 1612591104 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025831494483450354, + "loss": 2.7786, + "theoretical_loss": 3.4938565198365694, + "tokens_seen": 1612656640 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025830491474423267, + "loss": 2.8049, + "theoretical_loss": 3.4938441746480597, + "tokens_seen": 1612722176 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002582948846539619, + "loss": 2.7599, + "theoretical_loss": 3.49383183010167, + "tokens_seen": 1612787712 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025828485456369103, + "loss": 2.7989, + "theoretical_loss": 3.493819486197342, + "tokens_seen": 1612853248 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025827482447342027, + "loss": 2.8584, + "theoretical_loss": 3.493807142935016, + "tokens_seen": 1612918784 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025826479438314945, + "loss": 2.7723, + "theoretical_loss": 3.4937948003146326, + "tokens_seen": 1612984320 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025825476429287863, + "loss": 2.8265, + "theoretical_loss": 3.493782458336132, + "tokens_seen": 1613049856 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025824473420260787, + "loss": 2.714, + "theoretical_loss": 3.493770116999455, + "tokens_seen": 1613115392 + }, + { + "epoch": 4.07, + "learning_rate": 0.000258234704112337, + "loss": 2.7872, + "theoretical_loss": 3.493757776304542, + "tokens_seen": 1613180928 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025822467402206623, + "loss": 2.7591, + "theoretical_loss": 3.4937454362513334, + "tokens_seen": 1613246464 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002582146439317954, + "loss": 2.6491, + "theoretical_loss": 3.49373309683977, + "tokens_seen": 1613312000 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002582046138415246, + "loss": 2.8516, + "theoretical_loss": 3.4937207580697924, + "tokens_seen": 1613377536 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025819458375125377, + "loss": 2.8201, + "theoretical_loss": 3.493708419941341, + "tokens_seen": 1613443072 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025818455366098295, + "loss": 2.6856, + "theoretical_loss": 3.493696082454357, + "tokens_seen": 1613508608 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025817452357071213, + "loss": 2.7894, + "theoretical_loss": 3.49368374560878, + "tokens_seen": 1613574144 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025816449348044137, + "loss": 2.854, + "theoretical_loss": 3.493671409404551, + "tokens_seen": 1613639680 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002581544633901705, + "loss": 2.8684, + "theoretical_loss": 3.493659073841611, + "tokens_seen": 1613705216 + }, + { + "epoch": 4.07, + "objective/train/docs_used": 2573693, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.807957410812378, + "objective/train/theoretical_loss": 3.4936467389199004, + "objective/train/tokens_used": 1634230752, + "theoretical_loss": 3.4936467389199004, + "tokens_seen": 1613770752 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025814443329989973, + "loss": 2.8227, + "theoretical_loss": 3.4936467389199004, + "tokens_seen": 1613770752 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002581344032096289, + "loss": 2.7573, + "theoretical_loss": 3.4936344046393595, + "tokens_seen": 1613836288 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002581243731193581, + "loss": 2.7919, + "theoretical_loss": 3.4936220709999293, + "tokens_seen": 1613901824 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002581143430290873, + "loss": 2.6425, + "theoretical_loss": 3.4936097380015503, + "tokens_seen": 1613967360 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025810431293881646, + "loss": 2.9438, + "theoretical_loss": 3.493597405644163, + "tokens_seen": 1614032896 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025809428284854564, + "loss": 2.7398, + "theoretical_loss": 3.493585073927709, + "tokens_seen": 1614098432 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025808425275827487, + "loss": 2.5919, + "theoretical_loss": 3.4935727428521277, + "tokens_seen": 1614163968 + }, + { + "epoch": 4.07, + "learning_rate": 0.000258074222668004, + "loss": 2.6176, + "theoretical_loss": 3.4935604124173603, + "tokens_seen": 1614229504 + }, + { + "epoch": 4.07, + "learning_rate": 0.00025806419257773323, + "loss": 2.9092, + "theoretical_loss": 3.4935480826233474, + "tokens_seen": 1614295040 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025805416248746236, + "loss": 2.8879, + "theoretical_loss": 3.49353575347003, + "tokens_seen": 1614360576 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002580441323971916, + "loss": 2.7947, + "theoretical_loss": 3.493523424957348, + "tokens_seen": 1614426112 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002580341023069208, + "loss": 2.7002, + "theoretical_loss": 3.4935110970852437, + "tokens_seen": 1614491648 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025802407221664996, + "loss": 2.9393, + "theoretical_loss": 3.493498769853656, + "tokens_seen": 1614557184 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025801404212637914, + "loss": 2.8463, + "theoretical_loss": 3.493486443262527, + "tokens_seen": 1614622720 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002580040120361084, + "loss": 2.7314, + "theoretical_loss": 3.4934741173117967, + "tokens_seen": 1614688256 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002579939819458375, + "loss": 2.7277, + "theoretical_loss": 3.4934617920014057, + "tokens_seen": 1614753792 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025798395185556674, + "loss": 2.8255, + "theoretical_loss": 3.4934494673312955, + "tokens_seen": 1614819328 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025797392176529586, + "loss": 2.6907, + "theoretical_loss": 3.4934371433014055, + "tokens_seen": 1614884864 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002579638916750251, + "loss": 2.7448, + "theoretical_loss": 3.493424819911678, + "tokens_seen": 1614950400 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002579538615847543, + "loss": 2.7578, + "theoretical_loss": 3.493412497162053, + "tokens_seen": 1615015936 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025794383149448346, + "loss": 2.685, + "theoretical_loss": 3.493400175052472, + "tokens_seen": 1615081472 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025793380140421264, + "loss": 2.7104, + "theoretical_loss": 3.493387853582875, + "tokens_seen": 1615147008 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002579237713139418, + "loss": 2.8164, + "theoretical_loss": 3.493375532753203, + "tokens_seen": 1615212544 + }, + { + "epoch": 4.08, + "learning_rate": 0.000257913741223671, + "loss": 2.7422, + "theoretical_loss": 3.4933632125633958, + "tokens_seen": 1615278080 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025790371113340024, + "loss": 2.7399, + "theoretical_loss": 3.493350893013396, + "tokens_seen": 1615343616 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2576581, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5599517822265625, + "objective/train/theoretical_loss": 3.4933385741031437, + "objective/train/tokens_used": 1635869152, + "theoretical_loss": 3.4933385741031437, + "tokens_seen": 1615409152 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025789368104312937, + "loss": 2.8345, + "theoretical_loss": 3.4933385741031437, + "tokens_seen": 1615409152 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002578836509528586, + "loss": 2.7735, + "theoretical_loss": 3.4933262558325797, + "tokens_seen": 1615474688 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025787362086258773, + "loss": 2.7136, + "theoretical_loss": 3.4933139382016445, + "tokens_seen": 1615540224 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025786359077231697, + "loss": 2.7654, + "theoretical_loss": 3.4933016212102794, + "tokens_seen": 1615605760 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025785356068204615, + "loss": 2.8211, + "theoretical_loss": 3.493289304858425, + "tokens_seen": 1615671296 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025784353059177533, + "loss": 2.82, + "theoretical_loss": 3.4932769891460222, + "tokens_seen": 1615736832 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002578335005015045, + "loss": 2.6927, + "theoretical_loss": 3.493264674073012, + "tokens_seen": 1615802368 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025782347041123374, + "loss": 2.8135, + "theoretical_loss": 3.4932523596393352, + "tokens_seen": 1615867904 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025781344032096287, + "loss": 2.5452, + "theoretical_loss": 3.4932400458449324, + "tokens_seen": 1615933440 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002578034102306921, + "loss": 2.8287, + "theoretical_loss": 3.4932277326897454, + "tokens_seen": 1615998976 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025779338014042123, + "loss": 2.8566, + "theoretical_loss": 3.493215420173714, + "tokens_seen": 1616064512 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025778335005015047, + "loss": 2.7632, + "theoretical_loss": 3.4932031082967794, + "tokens_seen": 1616130048 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025777331995987965, + "loss": 2.7333, + "theoretical_loss": 3.4931907970588827, + "tokens_seen": 1616195584 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025776328986960883, + "loss": 2.6391, + "theoretical_loss": 3.493178486459965, + "tokens_seen": 1616261120 + }, + { + "epoch": 4.08, + "learning_rate": 0.000257753259779338, + "loss": 2.5869, + "theoretical_loss": 3.493166176499967, + "tokens_seen": 1616326656 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002577432296890672, + "loss": 2.8982, + "theoretical_loss": 3.4931538671788296, + "tokens_seen": 1616392192 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002577331995987964, + "loss": 2.8759, + "theoretical_loss": 3.4931415584964935, + "tokens_seen": 1616457728 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002577231695085256, + "loss": 2.8758, + "theoretical_loss": 3.4931292504529003, + "tokens_seen": 1616523264 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025771313941825474, + "loss": 2.792, + "theoretical_loss": 3.4931169430479905, + "tokens_seen": 1616588800 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025770310932798397, + "loss": 2.7756, + "theoretical_loss": 3.4931046362817053, + "tokens_seen": 1616654336 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002576930792377131, + "loss": 2.8038, + "theoretical_loss": 3.4930923301539853, + "tokens_seen": 1616719872 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025768304914744233, + "loss": 2.7299, + "theoretical_loss": 3.4930800246647715, + "tokens_seen": 1616785408 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002576730190571715, + "loss": 2.7463, + "theoretical_loss": 3.4930677198140057, + "tokens_seen": 1616850944 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002576629889669007, + "loss": 2.7813, + "theoretical_loss": 3.4930554156016282, + "tokens_seen": 1616916480 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002576529588766299, + "loss": 2.8889, + "theoretical_loss": 3.49304311202758, + "tokens_seen": 1616982016 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2578753, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0682246685028076, + "objective/train/theoretical_loss": 3.4930308090918025, + "objective/train/tokens_used": 1637507552, + "theoretical_loss": 3.4930308090918025, + "tokens_seen": 1617047552 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002576429287863591, + "loss": 2.8586, + "theoretical_loss": 3.4930308090918025, + "tokens_seen": 1617047552 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025763289869608824, + "loss": 2.8658, + "theoretical_loss": 3.493018506794236, + "tokens_seen": 1617113088 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002576228686058175, + "loss": 2.7796, + "theoretical_loss": 3.4930062051348223, + "tokens_seen": 1617178624 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002576128385155466, + "loss": 2.6726, + "theoretical_loss": 3.4929939041135016, + "tokens_seen": 1617244160 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025760280842527584, + "loss": 2.7476, + "theoretical_loss": 3.4929816037302164, + "tokens_seen": 1617309696 + }, + { + "epoch": 4.08, + "learning_rate": 0.000257592778335005, + "loss": 2.8364, + "theoretical_loss": 3.492969303984906, + "tokens_seen": 1617375232 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002575827482447342, + "loss": 2.952, + "theoretical_loss": 3.492957004877513, + "tokens_seen": 1617440768 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002575727181544634, + "loss": 2.7802, + "theoretical_loss": 3.492944706407977, + "tokens_seen": 1617506304 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025756268806419256, + "loss": 2.6668, + "theoretical_loss": 3.4929324085762405, + "tokens_seen": 1617571840 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025755265797392174, + "loss": 2.7409, + "theoretical_loss": 3.4929201113822437, + "tokens_seen": 1617637376 + }, + { + "epoch": 4.08, + "learning_rate": 0.000257542627883651, + "loss": 2.7622, + "theoretical_loss": 3.492907814825928, + "tokens_seen": 1617702912 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002575325977933801, + "loss": 2.8213, + "theoretical_loss": 3.4928955189072344, + "tokens_seen": 1617768448 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025752256770310934, + "loss": 2.902, + "theoretical_loss": 3.492883223626104, + "tokens_seen": 1617833984 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025751253761283847, + "loss": 2.8191, + "theoretical_loss": 3.4928709289824775, + "tokens_seen": 1617899520 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002575025075225677, + "loss": 2.7664, + "theoretical_loss": 3.4928586349762973, + "tokens_seen": 1617965056 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025749247743229694, + "loss": 2.6919, + "theoretical_loss": 3.4928463416075033, + "tokens_seen": 1618030592 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025748244734202606, + "loss": 2.8378, + "theoretical_loss": 3.4928340488760368, + "tokens_seen": 1618096128 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002574724172517553, + "loss": 2.9043, + "theoretical_loss": 3.492821756781839, + "tokens_seen": 1618161664 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002574623871614845, + "loss": 2.6943, + "theoretical_loss": 3.4928094653248523, + "tokens_seen": 1618227200 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025745235707121366, + "loss": 2.7859, + "theoretical_loss": 3.492797174505016, + "tokens_seen": 1618292736 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025744232698094284, + "loss": 2.9455, + "theoretical_loss": 3.492784884322272, + "tokens_seen": 1618358272 + }, + { + "epoch": 4.08, + "learning_rate": 0.000257432296890672, + "loss": 2.5701, + "theoretical_loss": 3.4927725947765618, + "tokens_seen": 1618423808 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002574222668004012, + "loss": 2.6373, + "theoretical_loss": 3.4927603058678267, + "tokens_seen": 1618489344 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025741223671013044, + "loss": 2.9733, + "theoretical_loss": 3.492748017596007, + "tokens_seen": 1618554880 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025740220661985957, + "loss": 2.817, + "theoretical_loss": 3.492735729961045, + "tokens_seen": 1618620416 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2581501, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8129847049713135, + "objective/train/theoretical_loss": 3.492723442962881, + "objective/train/tokens_used": 1639145952, + "theoretical_loss": 3.492723442962881, + "tokens_seen": 1618685952 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002573921765295888, + "loss": 2.7887, + "theoretical_loss": 3.492723442962881, + "tokens_seen": 1618685952 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025738214643931793, + "loss": 2.6773, + "theoretical_loss": 3.492711156601456, + "tokens_seen": 1618751488 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025737211634904717, + "loss": 2.7583, + "theoretical_loss": 3.492698870876713, + "tokens_seen": 1618817024 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025736208625877635, + "loss": 2.6397, + "theoretical_loss": 3.4926865857885914, + "tokens_seen": 1618882560 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025735205616850553, + "loss": 2.8186, + "theoretical_loss": 3.4926743013370327, + "tokens_seen": 1618948096 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002573420260782347, + "loss": 2.8, + "theoretical_loss": 3.492662017521979, + "tokens_seen": 1619013632 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025733199598796394, + "loss": 2.8259, + "theoretical_loss": 3.492649734343371, + "tokens_seen": 1619079168 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025732196589769307, + "loss": 2.7925, + "theoretical_loss": 3.4926374518011505, + "tokens_seen": 1619144704 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002573119358074223, + "loss": 2.69, + "theoretical_loss": 3.4926251698952577, + "tokens_seen": 1619210240 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025730190571715143, + "loss": 2.6724, + "theoretical_loss": 3.4926128886256347, + "tokens_seen": 1619275776 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025729187562688067, + "loss": 2.7407, + "theoretical_loss": 3.4926006079922223, + "tokens_seen": 1619341312 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025728184553660985, + "loss": 2.6549, + "theoretical_loss": 3.4925883279949623, + "tokens_seen": 1619406848 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025727181544633903, + "loss": 2.7803, + "theoretical_loss": 3.4925760486337962, + "tokens_seen": 1619472384 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002572617853560682, + "loss": 2.7985, + "theoretical_loss": 3.4925637699086645, + "tokens_seen": 1619537920 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002572517552657974, + "loss": 2.7245, + "theoretical_loss": 3.492551491819509, + "tokens_seen": 1619603456 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002572417251755266, + "loss": 2.7763, + "theoretical_loss": 3.492539214366271, + "tokens_seen": 1619668992 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002572316950852558, + "loss": 2.6617, + "theoretical_loss": 3.4925269375488917, + "tokens_seen": 1619734528 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025722166499498494, + "loss": 2.7539, + "theoretical_loss": 3.4925146613673124, + "tokens_seen": 1619800064 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025721163490471417, + "loss": 2.8124, + "theoretical_loss": 3.4925023858214743, + "tokens_seen": 1619865600 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002572016048144433, + "loss": 2.8472, + "theoretical_loss": 3.49249011091132, + "tokens_seen": 1619931136 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025719157472417253, + "loss": 2.7288, + "theoretical_loss": 3.4924778366367892, + "tokens_seen": 1619996672 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002571815446339017, + "loss": 2.6993, + "theoretical_loss": 3.492465562997824, + "tokens_seen": 1620062208 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002571715145436309, + "loss": 2.8225, + "theoretical_loss": 3.4924532899943657, + "tokens_seen": 1620127744 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002571614844533601, + "loss": 2.7719, + "theoretical_loss": 3.4924410176263563, + "tokens_seen": 1620193280 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002571514543630893, + "loss": 2.7433, + "theoretical_loss": 3.4924287458937364, + "tokens_seen": 1620258816 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2584455, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.801525354385376, + "objective/train/theoretical_loss": 3.492416474796447, + "objective/train/tokens_used": 1640784352, + "theoretical_loss": 3.492416474796447, + "tokens_seen": 1620324352 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025714142427281844, + "loss": 2.5951, + "theoretical_loss": 3.492416474796447, + "tokens_seen": 1620324352 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002571313941825477, + "loss": 2.8892, + "theoretical_loss": 3.4924042043344308, + "tokens_seen": 1620389888 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002571213640922768, + "loss": 2.6414, + "theoretical_loss": 3.4923919345076286, + "tokens_seen": 1620455424 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025711133400200604, + "loss": 2.7932, + "theoretical_loss": 3.4923796653159815, + "tokens_seen": 1620520960 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002571013039117352, + "loss": 2.7743, + "theoretical_loss": 3.4923673967594313, + "tokens_seen": 1620586496 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002570912738214644, + "loss": 2.6811, + "theoretical_loss": 3.4923551288379198, + "tokens_seen": 1620652032 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002570812437311936, + "loss": 2.9137, + "theoretical_loss": 3.492342861551388, + "tokens_seen": 1620717568 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025707121364092276, + "loss": 2.8355, + "theoretical_loss": 3.492330594899777, + "tokens_seen": 1620783104 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025706118355065194, + "loss": 2.6894, + "theoretical_loss": 3.4923183288830284, + "tokens_seen": 1620848640 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002570511534603812, + "loss": 2.7128, + "theoretical_loss": 3.4923060635010845, + "tokens_seen": 1620914176 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002570411233701103, + "loss": 2.9277, + "theoretical_loss": 3.4922937987538862, + "tokens_seen": 1620979712 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025703109327983954, + "loss": 2.7359, + "theoretical_loss": 3.492281534641375, + "tokens_seen": 1621045248 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025702106318956867, + "loss": 2.878, + "theoretical_loss": 3.4922692711634924, + "tokens_seen": 1621110784 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002570110330992979, + "loss": 2.6801, + "theoretical_loss": 3.49225700832018, + "tokens_seen": 1621176320 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002570010030090271, + "loss": 2.6951, + "theoretical_loss": 3.4922447461113792, + "tokens_seen": 1621241856 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025699097291875626, + "loss": 2.8102, + "theoretical_loss": 3.4922324845370314, + "tokens_seen": 1621307392 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025698094282848545, + "loss": 2.7399, + "theoretical_loss": 3.4922202235970783, + "tokens_seen": 1621372928 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002569709127382147, + "loss": 2.825, + "theoretical_loss": 3.4922079632914613, + "tokens_seen": 1621438464 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002569608826479438, + "loss": 2.7895, + "theoretical_loss": 3.4921957036201223, + "tokens_seen": 1621504000 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025695085255767304, + "loss": 2.7833, + "theoretical_loss": 3.4921834445830027, + "tokens_seen": 1621569536 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025694082246740217, + "loss": 2.8461, + "theoretical_loss": 3.4921711861800437, + "tokens_seen": 1621635072 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002569307923771314, + "loss": 2.7394, + "theoretical_loss": 3.4921589284111874, + "tokens_seen": 1621700608 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002569207622868606, + "loss": 2.8545, + "theoretical_loss": 3.492146671276375, + "tokens_seen": 1621766144 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025691073219658977, + "loss": 2.8261, + "theoretical_loss": 3.492134414775548, + "tokens_seen": 1621831680 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025690070210631895, + "loss": 2.7522, + "theoretical_loss": 3.492122158908649, + "tokens_seen": 1621897216 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2587231, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7472214698791504, + "objective/train/theoretical_loss": 3.492109903675618, + "objective/train/tokens_used": 1642422752, + "theoretical_loss": 3.492109903675618, + "tokens_seen": 1621962752 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025689067201604813, + "loss": 2.8659, + "theoretical_loss": 3.492109903675618, + "tokens_seen": 1621962752 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002568806419257773, + "loss": 2.9353, + "theoretical_loss": 3.4920976490763977, + "tokens_seen": 1622028288 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025687061183550655, + "loss": 2.7735, + "theoretical_loss": 3.492085395110929, + "tokens_seen": 1622093824 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002568605817452357, + "loss": 2.6661, + "theoretical_loss": 3.492073141779155, + "tokens_seen": 1622159360 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002568505516549649, + "loss": 2.9064, + "theoretical_loss": 3.4920608890810154, + "tokens_seen": 1622224896 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025684052156469404, + "loss": 2.797, + "theoretical_loss": 3.4920486370164525, + "tokens_seen": 1622290432 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025683049147442327, + "loss": 2.6135, + "theoretical_loss": 3.492036385585409, + "tokens_seen": 1622355968 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025682046138415245, + "loss": 2.7189, + "theoretical_loss": 3.492024134787825, + "tokens_seen": 1622421504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025681043129388163, + "loss": 2.6542, + "theoretical_loss": 3.4920118846236434, + "tokens_seen": 1622487040 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002568004012036108, + "loss": 2.8507, + "theoretical_loss": 3.491999635092805, + "tokens_seen": 1622552576 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025679037111334005, + "loss": 2.6892, + "theoretical_loss": 3.491987386195252, + "tokens_seen": 1622618112 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002567803410230692, + "loss": 2.8196, + "theoretical_loss": 3.491975137930926, + "tokens_seen": 1622683648 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002567703109327984, + "loss": 2.8361, + "theoretical_loss": 3.491962890299768, + "tokens_seen": 1622749184 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025676028084252754, + "loss": 2.8743, + "theoretical_loss": 3.491950643301721, + "tokens_seen": 1622814720 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002567502507522568, + "loss": 2.7279, + "theoretical_loss": 3.4919383969367255, + "tokens_seen": 1622880256 + }, + { + "epoch": 4.08, + "learning_rate": 0.000256740220661986, + "loss": 2.7662, + "theoretical_loss": 3.491926151204724, + "tokens_seen": 1622945792 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025673019057171514, + "loss": 2.6677, + "theoretical_loss": 3.4919139061056583, + "tokens_seen": 1623011328 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025672016048144437, + "loss": 2.724, + "theoretical_loss": 3.491901661639469, + "tokens_seen": 1623076864 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002567101303911735, + "loss": 2.7141, + "theoretical_loss": 3.491889417806099, + "tokens_seen": 1623142400 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025670010030090273, + "loss": 2.8078, + "theoretical_loss": 3.4918771746054897, + "tokens_seen": 1623207936 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002566900702106319, + "loss": 2.8813, + "theoretical_loss": 3.4918649320375827, + "tokens_seen": 1623273472 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002566800401203611, + "loss": 2.8727, + "theoretical_loss": 3.49185269010232, + "tokens_seen": 1623339008 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002566700100300903, + "loss": 2.7298, + "theoretical_loss": 3.491840448799643, + "tokens_seen": 1623404544 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002566599799398195, + "loss": 2.7289, + "theoretical_loss": 3.491828208129494, + "tokens_seen": 1623470080 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025664994984954864, + "loss": 2.8269, + "theoretical_loss": 3.491815968091814, + "tokens_seen": 1623535616 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2588581, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9668357372283936, + "objective/train/theoretical_loss": 3.4918037286865458, + "objective/train/tokens_used": 1644061152, + "theoretical_loss": 3.4918037286865458, + "tokens_seen": 1623601152 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002566399197592779, + "loss": 2.9177, + "theoretical_loss": 3.4918037286865458, + "tokens_seen": 1623601152 + }, + { + "epoch": 4.08, + "learning_rate": 0.000256629889669007, + "loss": 2.7892, + "theoretical_loss": 3.4917914899136306, + "tokens_seen": 1623666688 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025661985957873624, + "loss": 2.8517, + "theoretical_loss": 3.49177925177301, + "tokens_seen": 1623732224 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002566098294884654, + "loss": 2.8777, + "theoretical_loss": 3.491767014264626, + "tokens_seen": 1623797760 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002565997993981946, + "loss": 2.8247, + "theoretical_loss": 3.4917547773884214, + "tokens_seen": 1623863296 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002565897693079238, + "loss": 2.8458, + "theoretical_loss": 3.491742541144337, + "tokens_seen": 1623928832 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025657973921765296, + "loss": 2.8199, + "theoretical_loss": 3.491730305532314, + "tokens_seen": 1623994368 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025656970912738214, + "loss": 2.6156, + "theoretical_loss": 3.491718070552295, + "tokens_seen": 1624059904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002565596790371114, + "loss": 2.7648, + "theoretical_loss": 3.491705836204223, + "tokens_seen": 1624125440 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002565496489468405, + "loss": 2.6246, + "theoretical_loss": 3.491693602488038, + "tokens_seen": 1624190976 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025653961885656974, + "loss": 2.8159, + "theoretical_loss": 3.4916813694036826, + "tokens_seen": 1624256512 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025652958876629887, + "loss": 2.9042, + "theoretical_loss": 3.4916691369510993, + "tokens_seen": 1624322048 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002565195586760281, + "loss": 2.7665, + "theoretical_loss": 3.491656905130229, + "tokens_seen": 1624387584 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002565095285857573, + "loss": 2.7674, + "theoretical_loss": 3.491644673941014, + "tokens_seen": 1624453120 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025649949849548647, + "loss": 2.8809, + "theoretical_loss": 3.491632443383396, + "tokens_seen": 1624518656 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025648946840521565, + "loss": 2.6284, + "theoretical_loss": 3.4916202134573173, + "tokens_seen": 1624584192 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002564794383149449, + "loss": 2.5358, + "theoretical_loss": 3.4916079841627194, + "tokens_seen": 1624649728 + }, + { + "epoch": 4.08, + "learning_rate": 0.000256469408224674, + "loss": 2.8884, + "theoretical_loss": 3.491595755499545, + "tokens_seen": 1624715264 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025645937813440324, + "loss": 2.8998, + "theoretical_loss": 3.4915835274677347, + "tokens_seen": 1624780800 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025644934804413237, + "loss": 2.7792, + "theoretical_loss": 3.491571300067232, + "tokens_seen": 1624846336 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002564393179538616, + "loss": 2.6597, + "theoretical_loss": 3.4915590732979775, + "tokens_seen": 1624911872 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002564292878635908, + "loss": 2.7725, + "theoretical_loss": 3.4915468471599143, + "tokens_seen": 1624977408 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025641925777331997, + "loss": 2.8404, + "theoretical_loss": 3.491534621652983, + "tokens_seen": 1625042944 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025640922768304915, + "loss": 2.7913, + "theoretical_loss": 3.4915223967771265, + "tokens_seen": 1625108480 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025639919759277833, + "loss": 2.8259, + "theoretical_loss": 3.491510172532287, + "tokens_seen": 1625174016 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2591456, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.072849988937378, + "objective/train/theoretical_loss": 3.491497948918406, + "objective/train/tokens_used": 1645699552, + "theoretical_loss": 3.491497948918406, + "tokens_seen": 1625239552 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002563891675025075, + "loss": 2.7435, + "theoretical_loss": 3.491497948918406, + "tokens_seen": 1625239552 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025637913741223675, + "loss": 2.8199, + "theoretical_loss": 3.4914857259354255, + "tokens_seen": 1625305088 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002563691073219659, + "loss": 2.7753, + "theoretical_loss": 3.491473503583287, + "tokens_seen": 1625370624 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002563590772316951, + "loss": 2.7631, + "theoretical_loss": 3.491461281861934, + "tokens_seen": 1625436160 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025634904714142424, + "loss": 2.9086, + "theoretical_loss": 3.4914490607713073, + "tokens_seen": 1625501696 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025633901705115347, + "loss": 2.8017, + "theoretical_loss": 3.491436840311349, + "tokens_seen": 1625567232 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025632898696088265, + "loss": 2.8035, + "theoretical_loss": 3.4914246204820016, + "tokens_seen": 1625632768 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025631895687061183, + "loss": 2.6095, + "theoretical_loss": 3.491412401283207, + "tokens_seen": 1625698304 + }, + { + "epoch": 4.08, + "learning_rate": 0.000256308926780341, + "loss": 2.6867, + "theoretical_loss": 3.491400182714907, + "tokens_seen": 1625763840 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025629889669007025, + "loss": 2.6779, + "theoretical_loss": 3.4913879647770436, + "tokens_seen": 1625829376 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002562888665997994, + "loss": 2.7351, + "theoretical_loss": 3.4913757474695597, + "tokens_seen": 1625894912 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002562788365095286, + "loss": 2.91, + "theoretical_loss": 3.491363530792396, + "tokens_seen": 1625960448 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025626880641925774, + "loss": 2.8902, + "theoretical_loss": 3.4913513147454958, + "tokens_seen": 1626025984 + }, + { + "epoch": 4.08, + "learning_rate": 0.000256258776328987, + "loss": 2.8047, + "theoretical_loss": 3.4913390993288003, + "tokens_seen": 1626091520 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025624874623871616, + "loss": 2.6419, + "theoretical_loss": 3.4913268845422523, + "tokens_seen": 1626157056 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025623871614844534, + "loss": 2.8375, + "theoretical_loss": 3.4913146703857936, + "tokens_seen": 1626222592 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002562286860581745, + "loss": 2.6767, + "theoretical_loss": 3.491302456859366, + "tokens_seen": 1626288128 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002562186559679037, + "loss": 2.7587, + "theoretical_loss": 3.4912902439629123, + "tokens_seen": 1626353664 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002562086258776329, + "loss": 2.7198, + "theoretical_loss": 3.491278031696374, + "tokens_seen": 1626419200 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002561985957873621, + "loss": 2.7582, + "theoretical_loss": 3.4912658200596933, + "tokens_seen": 1626484736 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025618856569709124, + "loss": 2.9035, + "theoretical_loss": 3.4912536090528126, + "tokens_seen": 1626550272 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002561785356068205, + "loss": 2.6929, + "theoretical_loss": 3.4912413986756743, + "tokens_seen": 1626615808 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025616850551654966, + "loss": 2.858, + "theoretical_loss": 3.4912291889282194, + "tokens_seen": 1626681344 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025615847542627884, + "loss": 2.9051, + "theoretical_loss": 3.4912169798103916, + "tokens_seen": 1626746880 + }, + { + "epoch": 4.08, + "learning_rate": 0.000256148445336008, + "loss": 2.8335, + "theoretical_loss": 3.491204771322132, + "tokens_seen": 1626812416 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2594092, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7137722969055176, + "objective/train/theoretical_loss": 3.4911925634633834, + "objective/train/tokens_used": 1647337952, + "theoretical_loss": 3.4911925634633834, + "tokens_seen": 1626877952 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002561384152457372, + "loss": 2.7013, + "theoretical_loss": 3.4911925634633834, + "tokens_seen": 1626877952 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002561283851554664, + "loss": 2.7885, + "theoretical_loss": 3.491180356234087, + "tokens_seen": 1626943488 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002561183550651956, + "loss": 2.5928, + "theoretical_loss": 3.491168149634186, + "tokens_seen": 1627009024 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025610832497492475, + "loss": 2.6909, + "theoretical_loss": 3.491155943663623, + "tokens_seen": 1627074560 + }, + { + "epoch": 4.08, + "learning_rate": 0.000256098294884654, + "loss": 2.8917, + "theoretical_loss": 3.4911437383223385, + "tokens_seen": 1627140096 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002560882647943831, + "loss": 2.6691, + "theoretical_loss": 3.491131533610276, + "tokens_seen": 1627205632 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025607823470411234, + "loss": 2.6448, + "theoretical_loss": 3.4911193295273772, + "tokens_seen": 1627271168 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002560682046138415, + "loss": 2.9233, + "theoretical_loss": 3.491107126073585, + "tokens_seen": 1627336704 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002560581745235707, + "loss": 2.8636, + "theoretical_loss": 3.4910949232488413, + "tokens_seen": 1627402240 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002560481444332999, + "loss": 2.8402, + "theoretical_loss": 3.491082721053088, + "tokens_seen": 1627467776 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025603811434302907, + "loss": 2.7972, + "theoretical_loss": 3.491070519486267, + "tokens_seen": 1627533312 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025602808425275825, + "loss": 2.8385, + "theoretical_loss": 3.4910583185483217, + "tokens_seen": 1627598848 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002560180541624875, + "loss": 2.766, + "theoretical_loss": 3.491046118239194, + "tokens_seen": 1627664384 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002560080240722166, + "loss": 2.7599, + "theoretical_loss": 3.4910339185588257, + "tokens_seen": 1627729920 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025599799398194585, + "loss": 2.6329, + "theoretical_loss": 3.4910217195071596, + "tokens_seen": 1627795456 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002559879638916751, + "loss": 2.8798, + "theoretical_loss": 3.491009521084137, + "tokens_seen": 1627860992 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002559779338014042, + "loss": 2.8093, + "theoretical_loss": 3.490997323289702, + "tokens_seen": 1627926528 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025596790371113344, + "loss": 2.7309, + "theoretical_loss": 3.490985126123795, + "tokens_seen": 1627992064 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025595787362086257, + "loss": 2.7314, + "theoretical_loss": 3.4909729295863596, + "tokens_seen": 1628057600 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002559478435305918, + "loss": 2.7857, + "theoretical_loss": 3.490960733677338, + "tokens_seen": 1628123136 + }, + { + "epoch": 4.08, + "learning_rate": 0.000255937813440321, + "loss": 2.8744, + "theoretical_loss": 3.490948538396671, + "tokens_seen": 1628188672 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025592778335005017, + "loss": 2.7251, + "theoretical_loss": 3.4909363437443033, + "tokens_seen": 1628254208 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025591775325977935, + "loss": 2.8611, + "theoretical_loss": 3.4909241497201755, + "tokens_seen": 1628319744 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025590772316950853, + "loss": 2.7931, + "theoretical_loss": 3.490911956324231, + "tokens_seen": 1628385280 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002558976930792377, + "loss": 2.9671, + "theoretical_loss": 3.4908997635564116, + "tokens_seen": 1628450816 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2597038, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.817054271697998, + "objective/train/theoretical_loss": 3.4908875714166596, + "objective/train/tokens_used": 1648976352, + "theoretical_loss": 3.4908875714166596, + "tokens_seen": 1628516352 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025588766298896695, + "loss": 2.7705, + "theoretical_loss": 3.4908875714166596, + "tokens_seen": 1628516352 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002558776328986961, + "loss": 2.8879, + "theoretical_loss": 3.4908753799049173, + "tokens_seen": 1628581888 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002558676028084253, + "loss": 2.5957, + "theoretical_loss": 3.4908631890211277, + "tokens_seen": 1628647424 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025585757271815444, + "loss": 2.7254, + "theoretical_loss": 3.490850998765233, + "tokens_seen": 1628712960 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025584754262788367, + "loss": 2.8591, + "theoretical_loss": 3.490838809137175, + "tokens_seen": 1628778496 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025583751253761285, + "loss": 2.847, + "theoretical_loss": 3.4908266201368967, + "tokens_seen": 1628844032 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025582748244734203, + "loss": 2.9255, + "theoretical_loss": 3.490814431764341, + "tokens_seen": 1628909568 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002558174523570712, + "loss": 2.8232, + "theoretical_loss": 3.4908022440194486, + "tokens_seen": 1628975104 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025580742226680045, + "loss": 2.7088, + "theoretical_loss": 3.4907900569021635, + "tokens_seen": 1629040640 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002557973921765296, + "loss": 2.7396, + "theoretical_loss": 3.4907778704124275, + "tokens_seen": 1629106176 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002557873620862588, + "loss": 2.7554, + "theoretical_loss": 3.4907656845501833, + "tokens_seen": 1629171712 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025577733199598794, + "loss": 2.5978, + "theoretical_loss": 3.490753499315373, + "tokens_seen": 1629237248 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002557673019057172, + "loss": 2.8805, + "theoretical_loss": 3.490741314707939, + "tokens_seen": 1629302784 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025575727181544636, + "loss": 2.8597, + "theoretical_loss": 3.4907291307278245, + "tokens_seen": 1629368320 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025574724172517554, + "loss": 2.723, + "theoretical_loss": 3.4907169473749713, + "tokens_seen": 1629433856 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002557372116349047, + "loss": 2.7819, + "theoretical_loss": 3.4907047646493226, + "tokens_seen": 1629499392 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002557271815446339, + "loss": 2.8109, + "theoretical_loss": 3.4906925825508197, + "tokens_seen": 1629564928 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002557171514543631, + "loss": 2.682, + "theoretical_loss": 3.490680401079406, + "tokens_seen": 1629630464 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002557071213640923, + "loss": 2.6661, + "theoretical_loss": 3.4906682202350234, + "tokens_seen": 1629696000 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025569709127382144, + "loss": 2.7382, + "theoretical_loss": 3.4906560400176154, + "tokens_seen": 1629761536 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002556870611835507, + "loss": 2.7792, + "theoretical_loss": 3.4906438604271237, + "tokens_seen": 1629827072 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025567703109327986, + "loss": 2.8651, + "theoretical_loss": 3.490631681463491, + "tokens_seen": 1629892608 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025566700100300904, + "loss": 2.7729, + "theoretical_loss": 3.49061950312666, + "tokens_seen": 1629958144 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002556569709127382, + "loss": 2.7478, + "theoretical_loss": 3.4906073254165726, + "tokens_seen": 1630023680 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002556469408224674, + "loss": 2.7394, + "theoretical_loss": 3.4905951483331723, + "tokens_seen": 1630089216 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2599785, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5127370357513428, + "objective/train/theoretical_loss": 3.490582971876401, + "objective/train/tokens_used": 1650614752, + "theoretical_loss": 3.490582971876401, + "tokens_seen": 1630154752 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002556369107321966, + "loss": 2.8014, + "theoretical_loss": 3.490582971876401, + "tokens_seen": 1630154752 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002556268806419258, + "loss": 2.7861, + "theoretical_loss": 3.4905707960462014, + "tokens_seen": 1630220288 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025561685055165495, + "loss": 2.6281, + "theoretical_loss": 3.490558620842516, + "tokens_seen": 1630285824 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002556068204613842, + "loss": 2.8413, + "theoretical_loss": 3.490546446265288, + "tokens_seen": 1630351360 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002555967903711133, + "loss": 2.7938, + "theoretical_loss": 3.4905342723144597, + "tokens_seen": 1630416896 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025558676028084254, + "loss": 2.7335, + "theoretical_loss": 3.490522098989973, + "tokens_seen": 1630482432 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002555767301905717, + "loss": 2.844, + "theoretical_loss": 3.4905099262917707, + "tokens_seen": 1630547968 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002555667001003009, + "loss": 2.6605, + "theoretical_loss": 3.4904977542197964, + "tokens_seen": 1630613504 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002555566700100301, + "loss": 2.6907, + "theoretical_loss": 3.490485582773992, + "tokens_seen": 1630679040 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025554663991975927, + "loss": 2.8695, + "theoretical_loss": 3.4904734119542997, + "tokens_seen": 1630744576 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025553660982948845, + "loss": 2.6659, + "theoretical_loss": 3.490461241760663, + "tokens_seen": 1630810112 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002555265797392177, + "loss": 2.8196, + "theoretical_loss": 3.490449072193024, + "tokens_seen": 1630875648 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002555165496489468, + "loss": 2.7955, + "theoretical_loss": 3.490436903251325, + "tokens_seen": 1630941184 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025550651955867605, + "loss": 2.7775, + "theoretical_loss": 3.4904247349355098, + "tokens_seen": 1631006720 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025549648946840523, + "loss": 2.723, + "theoretical_loss": 3.49041256724552, + "tokens_seen": 1631072256 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002554864593781344, + "loss": 2.7752, + "theoretical_loss": 3.490400400181299, + "tokens_seen": 1631137792 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002554764292878636, + "loss": 2.7538, + "theoretical_loss": 3.490388233742789, + "tokens_seen": 1631203328 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025546639919759277, + "loss": 2.6775, + "theoretical_loss": 3.4903760679299323, + "tokens_seen": 1631268864 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025545636910732195, + "loss": 2.8416, + "theoretical_loss": 3.4903639027426725, + "tokens_seen": 1631334400 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002554463390170512, + "loss": 2.5833, + "theoretical_loss": 3.4903517381809523, + "tokens_seen": 1631399936 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002554363089267803, + "loss": 2.7341, + "theoretical_loss": 3.4903395742447136, + "tokens_seen": 1631465472 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025542627883650955, + "loss": 2.7911, + "theoretical_loss": 3.490327410933899, + "tokens_seen": 1631531008 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002554162487462387, + "loss": 2.8148, + "theoretical_loss": 3.4903152482484527, + "tokens_seen": 1631596544 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002554062186559679, + "loss": 2.5603, + "theoretical_loss": 3.490303086188316, + "tokens_seen": 1631662080 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002553961885656971, + "loss": 2.9419, + "theoretical_loss": 3.4902909247534324, + "tokens_seen": 1631727616 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2602598, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.800208806991577, + "objective/train/theoretical_loss": 3.4902787639437443, + "objective/train/tokens_used": 1652253152, + "theoretical_loss": 3.4902787639437443, + "tokens_seen": 1631793152 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002553861584754263, + "loss": 2.8178, + "theoretical_loss": 3.4902787639437443, + "tokens_seen": 1631793152 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025537612838515546, + "loss": 2.5311, + "theoretical_loss": 3.4902666037591947, + "tokens_seen": 1631858688 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025536609829488464, + "loss": 2.7873, + "theoretical_loss": 3.4902544441997256, + "tokens_seen": 1631924224 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002553560682046138, + "loss": 2.7158, + "theoretical_loss": 3.490242285265281, + "tokens_seen": 1631989760 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025534603811434305, + "loss": 2.7454, + "theoretical_loss": 3.4902301269558027, + "tokens_seen": 1632055296 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002553360080240722, + "loss": 2.9294, + "theoretical_loss": 3.4902179692712334, + "tokens_seen": 1632120832 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002553259779338014, + "loss": 2.801, + "theoretical_loss": 3.490205812211517, + "tokens_seen": 1632186368 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002553159478435306, + "loss": 2.822, + "theoretical_loss": 3.4901936557765953, + "tokens_seen": 1632251904 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002553059177532598, + "loss": 2.8349, + "theoretical_loss": 3.490181499966411, + "tokens_seen": 1632317440 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025529588766298896, + "loss": 2.7883, + "theoretical_loss": 3.4901693447809077, + "tokens_seen": 1632382976 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025528585757271814, + "loss": 2.7736, + "theoretical_loss": 3.490157190220028, + "tokens_seen": 1632448512 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002552758274824473, + "loss": 2.7202, + "theoretical_loss": 3.4901450362837148, + "tokens_seen": 1632514048 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025526579739217656, + "loss": 2.7935, + "theoretical_loss": 3.49013288297191, + "tokens_seen": 1632579584 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002552557673019057, + "loss": 2.7568, + "theoretical_loss": 3.4901207302845574, + "tokens_seen": 1632645120 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002552457372116349, + "loss": 2.7182, + "theoretical_loss": 3.4901085782215997, + "tokens_seen": 1632710656 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002552357071213641, + "loss": 2.6082, + "theoretical_loss": 3.49009642678298, + "tokens_seen": 1632776192 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002552256770310933, + "loss": 2.7122, + "theoretical_loss": 3.4900842759686403, + "tokens_seen": 1632841728 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002552156469408225, + "loss": 2.6304, + "theoretical_loss": 3.490072125778524, + "tokens_seen": 1632907264 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025520561685055164, + "loss": 2.6705, + "theoretical_loss": 3.4900599762125744, + "tokens_seen": 1632972800 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002551955867602809, + "loss": 2.8504, + "theoretical_loss": 3.4900478272707334, + "tokens_seen": 1633038336 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025518555667001006, + "loss": 2.7763, + "theoretical_loss": 3.490035678952945, + "tokens_seen": 1633103872 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025517552657973924, + "loss": 2.7297, + "theoretical_loss": 3.490023531259151, + "tokens_seen": 1633169408 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002551654964894684, + "loss": 2.6749, + "theoretical_loss": 3.4900113841892955, + "tokens_seen": 1633234944 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002551554663991976, + "loss": 2.5979, + "theoretical_loss": 3.48999923774332, + "tokens_seen": 1633300480 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002551454363089268, + "loss": 2.9121, + "theoretical_loss": 3.489987091921169, + "tokens_seen": 1633366016 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2605386, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9682953357696533, + "objective/train/theoretical_loss": 3.4899749467227843, + "objective/train/tokens_used": 1653891552, + "theoretical_loss": 3.4899749467227843, + "tokens_seen": 1633431552 + }, + { + "epoch": 4.08, + "learning_rate": 0.000255135406218656, + "loss": 2.7445, + "theoretical_loss": 3.4899749467227843, + "tokens_seen": 1633431552 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025512537612838515, + "loss": 2.8183, + "theoretical_loss": 3.489962802148109, + "tokens_seen": 1633497088 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002551153460381144, + "loss": 2.7824, + "theoretical_loss": 3.4899506581970865, + "tokens_seen": 1633562624 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002551053159478435, + "loss": 2.7478, + "theoretical_loss": 3.4899385148696593, + "tokens_seen": 1633628160 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025509528585757274, + "loss": 2.7447, + "theoretical_loss": 3.4899263721657707, + "tokens_seen": 1633693696 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002550852557673019, + "loss": 2.7624, + "theoretical_loss": 3.489914230085364, + "tokens_seen": 1633759232 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002550752256770311, + "loss": 2.8821, + "theoretical_loss": 3.489902088628381, + "tokens_seen": 1633824768 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002550651955867603, + "loss": 2.7207, + "theoretical_loss": 3.489889947794766, + "tokens_seen": 1633890304 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025505516549648947, + "loss": 2.6335, + "theoretical_loss": 3.4898778075844605, + "tokens_seen": 1633955840 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025504513540621865, + "loss": 2.8171, + "theoretical_loss": 3.4898656679974094, + "tokens_seen": 1634021376 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002550351053159479, + "loss": 2.8679, + "theoretical_loss": 3.489853529033554, + "tokens_seen": 1634086912 + }, + { + "epoch": 4.08, + "learning_rate": 0.000255025075225677, + "loss": 2.7833, + "theoretical_loss": 3.4898413906928383, + "tokens_seen": 1634152448 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025501504513540625, + "loss": 2.8843, + "theoretical_loss": 3.4898292529752046, + "tokens_seen": 1634217984 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025500501504513543, + "loss": 2.7766, + "theoretical_loss": 3.489817115880597, + "tokens_seen": 1634283520 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002549949849548646, + "loss": 2.8122, + "theoretical_loss": 3.489804979408958, + "tokens_seen": 1634349056 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002549849548645938, + "loss": 2.7737, + "theoretical_loss": 3.4897928435602297, + "tokens_seen": 1634414592 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025497492477432297, + "loss": 2.6968, + "theoretical_loss": 3.489780708334356, + "tokens_seen": 1634480128 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025496489468405215, + "loss": 2.7262, + "theoretical_loss": 3.489768573731281, + "tokens_seen": 1634545664 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002549548645937814, + "loss": 2.618, + "theoretical_loss": 3.4897564397509457, + "tokens_seen": 1634611200 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002549448345035105, + "loss": 2.7073, + "theoretical_loss": 3.4897443063932947, + "tokens_seen": 1634676736 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025493480441323975, + "loss": 2.6421, + "theoretical_loss": 3.489732173658271, + "tokens_seen": 1634742272 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002549247743229689, + "loss": 2.5576, + "theoretical_loss": 3.4897200415458167, + "tokens_seen": 1634807808 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002549147442326981, + "loss": 2.6553, + "theoretical_loss": 3.4897079100558757, + "tokens_seen": 1634873344 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002549047141424273, + "loss": 2.6604, + "theoretical_loss": 3.489695779188391, + "tokens_seen": 1634938880 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002548946840521565, + "loss": 2.8277, + "theoretical_loss": 3.489683648943305, + "tokens_seen": 1635004416 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2607959, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9202136993408203, + "objective/train/theoretical_loss": 3.489671519320562, + "objective/train/tokens_used": 1655529952, + "theoretical_loss": 3.489671519320562, + "tokens_seen": 1635069952 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025488465396188566, + "loss": 2.8403, + "theoretical_loss": 3.489671519320562, + "tokens_seen": 1635069952 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025487462387161484, + "loss": 2.6937, + "theoretical_loss": 3.4896593903201047, + "tokens_seen": 1635135488 + }, + { + "epoch": 4.08, + "learning_rate": 0.000254864593781344, + "loss": 2.7425, + "theoretical_loss": 3.4896472619418755, + "tokens_seen": 1635201024 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025485456369107325, + "loss": 2.5422, + "theoretical_loss": 3.4896351341858187, + "tokens_seen": 1635266560 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002548445336008024, + "loss": 2.7154, + "theoretical_loss": 3.4896230070518763, + "tokens_seen": 1635332096 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002548345035105316, + "loss": 2.8338, + "theoretical_loss": 3.4896108805399924, + "tokens_seen": 1635397632 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002548244734202608, + "loss": 2.8908, + "theoretical_loss": 3.4895987546501095, + "tokens_seen": 1635463168 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025481444332999, + "loss": 2.7903, + "theoretical_loss": 3.4895866293821713, + "tokens_seen": 1635528704 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025480441323971916, + "loss": 2.8133, + "theoretical_loss": 3.4895745047361206, + "tokens_seen": 1635594240 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025479438314944834, + "loss": 2.7607, + "theoretical_loss": 3.4895623807119014, + "tokens_seen": 1635659776 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002547843530591775, + "loss": 2.8569, + "theoretical_loss": 3.4895502573094554, + "tokens_seen": 1635725312 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025477432296890676, + "loss": 2.8893, + "theoretical_loss": 3.4895381345287273, + "tokens_seen": 1635790848 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002547642928786359, + "loss": 2.7069, + "theoretical_loss": 3.489526012369659, + "tokens_seen": 1635856384 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002547542627883651, + "loss": 2.8161, + "theoretical_loss": 3.489513890832195, + "tokens_seen": 1635921920 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025474423269809425, + "loss": 2.7746, + "theoretical_loss": 3.489501769916277, + "tokens_seen": 1635987456 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002547342026078235, + "loss": 2.7814, + "theoretical_loss": 3.4894896496218504, + "tokens_seen": 1636052992 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025472417251755266, + "loss": 2.7752, + "theoretical_loss": 3.4894775299488563, + "tokens_seen": 1636118528 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025471414242728184, + "loss": 2.6381, + "theoretical_loss": 3.489465410897239, + "tokens_seen": 1636184064 + }, + { + "epoch": 4.08, + "learning_rate": 0.000254704112337011, + "loss": 2.638, + "theoretical_loss": 3.4894532924669415, + "tokens_seen": 1636249600 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025469408224674026, + "loss": 2.6749, + "theoretical_loss": 3.489441174657907, + "tokens_seen": 1636315136 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002546840521564694, + "loss": 2.799, + "theoretical_loss": 3.4894290574700797, + "tokens_seen": 1636380672 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002546740220661986, + "loss": 2.8422, + "theoretical_loss": 3.4894169409034013, + "tokens_seen": 1636446208 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025466399197592775, + "loss": 2.7004, + "theoretical_loss": 3.4894048249578162, + "tokens_seen": 1636511744 + }, + { + "epoch": 4.08, + "learning_rate": 0.000254653961885657, + "loss": 2.8199, + "theoretical_loss": 3.4893927096332673, + "tokens_seen": 1636577280 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025464393179538616, + "loss": 2.8143, + "theoretical_loss": 3.4893805949296977, + "tokens_seen": 1636642816 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2609380, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.722306251525879, + "objective/train/theoretical_loss": 3.489368480847051, + "objective/train/tokens_used": 1657168352, + "theoretical_loss": 3.489368480847051, + "tokens_seen": 1636708352 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025463390170511535, + "loss": 2.7192, + "theoretical_loss": 3.489368480847051, + "tokens_seen": 1636708352 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025462387161484453, + "loss": 2.8923, + "theoretical_loss": 3.4893563673852706, + "tokens_seen": 1636773888 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002546138415245737, + "loss": 2.8217, + "theoretical_loss": 3.4893442545443003, + "tokens_seen": 1636839424 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002546038114343029, + "loss": 2.6541, + "theoretical_loss": 3.489332142324082, + "tokens_seen": 1636904960 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002545937813440321, + "loss": 2.7236, + "theoretical_loss": 3.48932003072456, + "tokens_seen": 1636970496 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025458375125376125, + "loss": 2.6529, + "theoretical_loss": 3.489307919745678, + "tokens_seen": 1637036032 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002545737211634905, + "loss": 2.799, + "theoretical_loss": 3.4892958093873783, + "tokens_seen": 1637101568 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002545636910732196, + "loss": 2.8343, + "theoretical_loss": 3.4892836996496053, + "tokens_seen": 1637167104 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025455366098294885, + "loss": 2.7128, + "theoretical_loss": 3.4892715905323017, + "tokens_seen": 1637232640 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025454363089267803, + "loss": 2.9273, + "theoretical_loss": 3.489259482035411, + "tokens_seen": 1637298176 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002545336008024072, + "loss": 2.7206, + "theoretical_loss": 3.4892473741588765, + "tokens_seen": 1637363712 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002545235707121364, + "loss": 2.6425, + "theoretical_loss": 3.489235266902642, + "tokens_seen": 1637429248 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025451354062186563, + "loss": 2.7664, + "theoretical_loss": 3.4892231602666506, + "tokens_seen": 1637494784 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002545035105315948, + "loss": 2.9195, + "theoretical_loss": 3.4892110542508457, + "tokens_seen": 1637560320 + }, + { + "epoch": 4.08, + "learning_rate": 0.000254493480441324, + "loss": 2.7421, + "theoretical_loss": 3.4891989488551705, + "tokens_seen": 1637625856 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025448345035105317, + "loss": 2.6742, + "theoretical_loss": 3.489186844079569, + "tokens_seen": 1637691392 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025447342026078235, + "loss": 2.6579, + "theoretical_loss": 3.4891747399239845, + "tokens_seen": 1637756928 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002544633901705116, + "loss": 2.7699, + "theoretical_loss": 3.4891626363883597, + "tokens_seen": 1637822464 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002544533600802407, + "loss": 2.6905, + "theoretical_loss": 3.489150533472639, + "tokens_seen": 1637888000 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025444332998996995, + "loss": 2.6909, + "theoretical_loss": 3.489138431176765, + "tokens_seen": 1637953536 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002544332998996991, + "loss": 2.8616, + "theoretical_loss": 3.4891263295006816, + "tokens_seen": 1638019072 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002544232698094283, + "loss": 2.8094, + "theoretical_loss": 3.4891142284443326, + "tokens_seen": 1638084608 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002544132397191575, + "loss": 2.6412, + "theoretical_loss": 3.489102128007661, + "tokens_seen": 1638150144 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002544032096288867, + "loss": 2.8118, + "theoretical_loss": 3.4890900281906103, + "tokens_seen": 1638215680 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025439317953861586, + "loss": 2.8174, + "theoretical_loss": 3.489077928993124, + "tokens_seen": 1638281216 + }, + { + "debugging/Self-BLEU-5": 0.5599754102347919, + "debugging/distinct-1-grams": 0.7502248883362943, + "debugging/distinct-2-grams": 0.9456351257221094, + "debugging/entropy-1-grams": 6.115918752584259, + "debugging/entropy-2-grams": 7.156358321383862, + "debugging/length": 526.2777777777778, + "debugging/num_segments": 18, + "epoch": 4.08, + "objective/train/docs_used": 2612137, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7066800594329834, + "objective/train/theoretical_loss": 3.4890658304151456, + "objective/train/tokens_used": 1658806752, + "theoretical_loss": 3.4890658304151456, + "tokens_seen": 1638346752 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025438314944834504, + "loss": 2.6649, + "theoretical_loss": 3.4890658304151456, + "tokens_seen": 1638346752 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002543731193580742, + "loss": 2.6973, + "theoretical_loss": 3.489053732456619, + "tokens_seen": 1638412288 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025436308926780345, + "loss": 2.6216, + "theoretical_loss": 3.4890416351174873, + "tokens_seen": 1638477824 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002543530591775326, + "loss": 2.7002, + "theoretical_loss": 3.489029538397694, + "tokens_seen": 1638543360 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002543430290872618, + "loss": 2.7776, + "theoretical_loss": 3.489017442297183, + "tokens_seen": 1638608896 + }, + { + "epoch": 4.08, + "learning_rate": 0.000254332998996991, + "loss": 2.7404, + "theoretical_loss": 3.489005346815897, + "tokens_seen": 1638674432 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002543229689067202, + "loss": 2.755, + "theoretical_loss": 3.4889932519537803, + "tokens_seen": 1638739968 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025431293881644936, + "loss": 2.77, + "theoretical_loss": 3.4889811577107763, + "tokens_seen": 1638805504 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025430290872617854, + "loss": 2.7009, + "theoretical_loss": 3.4889690640868283, + "tokens_seen": 1638871040 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002542928786359077, + "loss": 2.7588, + "theoretical_loss": 3.4889569710818806, + "tokens_seen": 1638936576 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025428284854563696, + "loss": 2.655, + "theoretical_loss": 3.4889448786958757, + "tokens_seen": 1639002112 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002542728184553661, + "loss": 2.9625, + "theoretical_loss": 3.488932786928758, + "tokens_seen": 1639067648 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002542627883650953, + "loss": 2.6932, + "theoretical_loss": 3.48892069578047, + "tokens_seen": 1639133184 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025425275827482445, + "loss": 2.6798, + "theoretical_loss": 3.4889086052509564, + "tokens_seen": 1639198720 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002542427281845537, + "loss": 2.8523, + "theoretical_loss": 3.488896515340161, + "tokens_seen": 1639264256 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025423269809428286, + "loss": 2.6705, + "theoretical_loss": 3.4888844260480267, + "tokens_seen": 1639329792 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025422266800401204, + "loss": 2.857, + "theoretical_loss": 3.488872337374497, + "tokens_seen": 1639395328 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002542126379137412, + "loss": 2.8167, + "theoretical_loss": 3.4888602493195155, + "tokens_seen": 1639460864 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025420260782347046, + "loss": 2.724, + "theoretical_loss": 3.4888481618830265, + "tokens_seen": 1639526400 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002541925777331996, + "loss": 2.7608, + "theoretical_loss": 3.488836075064973, + "tokens_seen": 1639591936 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002541825476429288, + "loss": 2.7617, + "theoretical_loss": 3.488823988865299, + "tokens_seen": 1639657472 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025417251755265795, + "loss": 2.7946, + "theoretical_loss": 3.488811903283948, + "tokens_seen": 1639723008 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002541624874623872, + "loss": 2.7934, + "theoretical_loss": 3.4887998183208637, + "tokens_seen": 1639788544 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025415245737211637, + "loss": 2.6966, + "theoretical_loss": 3.4887877339759905, + "tokens_seen": 1639854080 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025414242728184555, + "loss": 2.6845, + "theoretical_loss": 3.4887756502492704, + "tokens_seen": 1639919616 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2614983, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4480674266815186, + "objective/train/theoretical_loss": 3.4887635671406483, + "objective/train/tokens_used": 1660445152, + "theoretical_loss": 3.4887635671406483, + "tokens_seen": 1639985152 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025413239719157473, + "loss": 2.6455, + "theoretical_loss": 3.4887635671406483, + "tokens_seen": 1639985152 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002541223671013039, + "loss": 2.8015, + "theoretical_loss": 3.4887514846500673, + "tokens_seen": 1640050688 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002541123370110331, + "loss": 2.769, + "theoretical_loss": 3.4887394027774716, + "tokens_seen": 1640116224 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002541023069207623, + "loss": 2.6984, + "theoretical_loss": 3.4887273215228047, + "tokens_seen": 1640181760 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025409227683049145, + "loss": 2.7517, + "theoretical_loss": 3.4887152408860103, + "tokens_seen": 1640247296 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002540822467402207, + "loss": 2.8855, + "theoretical_loss": 3.4887031608670322, + "tokens_seen": 1640312832 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002540722166499498, + "loss": 2.8271, + "theoretical_loss": 3.488691081465814, + "tokens_seen": 1640378368 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025406218655967905, + "loss": 2.7086, + "theoretical_loss": 3.488679002682299, + "tokens_seen": 1640443904 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025405215646940823, + "loss": 2.7922, + "theoretical_loss": 3.4886669245164317, + "tokens_seen": 1640509440 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002540421263791374, + "loss": 2.8586, + "theoretical_loss": 3.4886548469681555, + "tokens_seen": 1640574976 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002540320962888666, + "loss": 2.7761, + "theoretical_loss": 3.4886427700374147, + "tokens_seen": 1640640512 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025402206619859583, + "loss": 2.7153, + "theoretical_loss": 3.488630693724152, + "tokens_seen": 1640706048 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025401203610832495, + "loss": 2.8071, + "theoretical_loss": 3.488618618028312, + "tokens_seen": 1640771584 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002540020060180542, + "loss": 2.7091, + "theoretical_loss": 3.488606542949838, + "tokens_seen": 1640837120 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002539919759277833, + "loss": 2.8645, + "theoretical_loss": 3.4885944684886736, + "tokens_seen": 1640902656 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025398194583751255, + "loss": 2.735, + "theoretical_loss": 3.4885823946447636, + "tokens_seen": 1640968192 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025397191574724173, + "loss": 2.6803, + "theoretical_loss": 3.4885703214180506, + "tokens_seen": 1641033728 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002539618856569709, + "loss": 2.7449, + "theoretical_loss": 3.488558248808479, + "tokens_seen": 1641099264 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002539518555667001, + "loss": 2.9335, + "theoretical_loss": 3.4885461768159933, + "tokens_seen": 1641164800 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002539418254764293, + "loss": 2.7527, + "theoretical_loss": 3.488534105440536, + "tokens_seen": 1641230336 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025393179538615846, + "loss": 2.7144, + "theoretical_loss": 3.4885220346820516, + "tokens_seen": 1641295872 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002539217652958877, + "loss": 2.6508, + "theoretical_loss": 3.4885099645404836, + "tokens_seen": 1641361408 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002539117352056168, + "loss": 2.91, + "theoretical_loss": 3.488497895015777, + "tokens_seen": 1641426944 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025390170511534606, + "loss": 2.9232, + "theoretical_loss": 3.4884858261078735, + "tokens_seen": 1641492480 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002538916750250752, + "loss": 2.8666, + "theoretical_loss": 3.4884737578167186, + "tokens_seen": 1641558016 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2617888, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.764310121536255, + "objective/train/theoretical_loss": 3.488461690142256, + "objective/train/tokens_used": 1662083552, + "theoretical_loss": 3.488461690142256, + "tokens_seen": 1641623552 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002538816449348044, + "loss": 2.5981, + "theoretical_loss": 3.488461690142256, + "tokens_seen": 1641623552 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002538716148445336, + "loss": 2.963, + "theoretical_loss": 3.488449623084429, + "tokens_seen": 1641689088 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002538615847542628, + "loss": 2.7455, + "theoretical_loss": 3.488437556643182, + "tokens_seen": 1641754624 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025385155466399196, + "loss": 2.7275, + "theoretical_loss": 3.4884254908184587, + "tokens_seen": 1641820160 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002538415245737212, + "loss": 2.8503, + "theoretical_loss": 3.488413425610203, + "tokens_seen": 1641885696 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002538314944834503, + "loss": 2.8446, + "theoretical_loss": 3.488401361018359, + "tokens_seen": 1641951232 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025382146439317956, + "loss": 2.7523, + "theoretical_loss": 3.4883892970428696, + "tokens_seen": 1642016768 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002538114343029087, + "loss": 2.8312, + "theoretical_loss": 3.4883772336836802, + "tokens_seen": 1642082304 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002538014042126379, + "loss": 2.6521, + "theoretical_loss": 3.4883651709407335, + "tokens_seen": 1642147840 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002537913741223671, + "loss": 2.8943, + "theoretical_loss": 3.4883531088139743, + "tokens_seen": 1642213376 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002537813440320963, + "loss": 2.6789, + "theoretical_loss": 3.4883410473033463, + "tokens_seen": 1642278912 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025377131394182546, + "loss": 2.5768, + "theoretical_loss": 3.488328986408793, + "tokens_seen": 1642344448 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025376128385155465, + "loss": 2.7554, + "theoretical_loss": 3.4883169261302585, + "tokens_seen": 1642409984 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002537512537612839, + "loss": 2.8892, + "theoretical_loss": 3.4883048664676872, + "tokens_seen": 1642475520 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025374122367101306, + "loss": 2.7358, + "theoretical_loss": 3.488292807421023, + "tokens_seen": 1642541056 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025373119358074224, + "loss": 2.807, + "theoretical_loss": 3.488280748990209, + "tokens_seen": 1642606592 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002537211634904714, + "loss": 2.7956, + "theoretical_loss": 3.4882686911751906, + "tokens_seen": 1642672128 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025371113340020066, + "loss": 2.7251, + "theoretical_loss": 3.4882566339759107, + "tokens_seen": 1642737664 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002537011033099298, + "loss": 2.9745, + "theoretical_loss": 3.4882445773923134, + "tokens_seen": 1642803200 + }, + { + "epoch": 4.08, + "learning_rate": 0.000253691073219659, + "loss": 2.6883, + "theoretical_loss": 3.488232521424343, + "tokens_seen": 1642868736 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025368104312938815, + "loss": 2.8426, + "theoretical_loss": 3.488220466071944, + "tokens_seen": 1642934272 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002536710130391174, + "loss": 2.6409, + "theoretical_loss": 3.4882084113350587, + "tokens_seen": 1642999808 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025366098294884657, + "loss": 2.6822, + "theoretical_loss": 3.488196357213633, + "tokens_seen": 1643065344 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025365095285857575, + "loss": 2.5632, + "theoretical_loss": 3.4881843037076106, + "tokens_seen": 1643130880 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025364092276830493, + "loss": 2.9625, + "theoretical_loss": 3.488172250816935, + "tokens_seen": 1643196416 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2620141, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.706641435623169, + "objective/train/theoretical_loss": 3.4881601985415496, + "objective/train/tokens_used": 1663721952, + "theoretical_loss": 3.4881601985415496, + "tokens_seen": 1643261952 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002536308926780341, + "loss": 2.8676, + "theoretical_loss": 3.4881601985415496, + "tokens_seen": 1643261952 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002536208625877633, + "loss": 2.7626, + "theoretical_loss": 3.4881481468813997, + "tokens_seen": 1643327488 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002536108324974925, + "loss": 2.6156, + "theoretical_loss": 3.4881360958364294, + "tokens_seen": 1643393024 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025360080240722165, + "loss": 2.6282, + "theoretical_loss": 3.4881240454065816, + "tokens_seen": 1643458560 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002535907723169509, + "loss": 2.8543, + "theoretical_loss": 3.488111995591801, + "tokens_seen": 1643524096 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025358074222668, + "loss": 2.7948, + "theoretical_loss": 3.4880999463920324, + "tokens_seen": 1643589632 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025357071213640925, + "loss": 2.898, + "theoretical_loss": 3.4880878978072185, + "tokens_seen": 1643655168 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025356068204613843, + "loss": 2.838, + "theoretical_loss": 3.4880758498373043, + "tokens_seen": 1643720704 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002535506519558676, + "loss": 2.8603, + "theoretical_loss": 3.488063802482234, + "tokens_seen": 1643786240 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002535406218655968, + "loss": 2.6919, + "theoretical_loss": 3.4880517557419513, + "tokens_seen": 1643851776 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025353059177532603, + "loss": 2.7898, + "theoretical_loss": 3.4880397096164, + "tokens_seen": 1643917312 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025352056168505516, + "loss": 2.7727, + "theoretical_loss": 3.488027664105525, + "tokens_seen": 1643982848 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002535105315947844, + "loss": 2.73, + "theoretical_loss": 3.4880156192092704, + "tokens_seen": 1644048384 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002535005015045135, + "loss": 2.8272, + "theoretical_loss": 3.48800357492758, + "tokens_seen": 1644113920 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025349047141424275, + "loss": 2.8022, + "theoretical_loss": 3.4879915312603975, + "tokens_seen": 1644179456 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025348044132397193, + "loss": 2.7559, + "theoretical_loss": 3.4879794882076682, + "tokens_seen": 1644244992 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002534704112337011, + "loss": 2.7407, + "theoretical_loss": 3.487967445769335, + "tokens_seen": 1644310528 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002534603811434303, + "loss": 2.6708, + "theoretical_loss": 3.4879554039453433, + "tokens_seen": 1644376064 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002534503510531595, + "loss": 2.7842, + "theoretical_loss": 3.487943362735636, + "tokens_seen": 1644441600 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025344032096288866, + "loss": 2.9052, + "theoretical_loss": 3.487931322140158, + "tokens_seen": 1644507136 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002534302908726179, + "loss": 2.6296, + "theoretical_loss": 3.4879192821588543, + "tokens_seen": 1644572672 + }, + { + "epoch": 4.08, + "learning_rate": 0.000253420260782347, + "loss": 2.8159, + "theoretical_loss": 3.4879072427916675, + "tokens_seen": 1644638208 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025341023069207626, + "loss": 2.6321, + "theoretical_loss": 3.4878952040385425, + "tokens_seen": 1644703744 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002534002006018054, + "loss": 2.6238, + "theoretical_loss": 3.4878831658994236, + "tokens_seen": 1644769280 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002533901705115346, + "loss": 2.6792, + "theoretical_loss": 3.4878711283742554, + "tokens_seen": 1644834816 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2622781, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.009021759033203, + "objective/train/theoretical_loss": 3.4878590914629815, + "objective/train/tokens_used": 1665360352, + "theoretical_loss": 3.4878590914629815, + "tokens_seen": 1644900352 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002533801404212638, + "loss": 2.8645, + "theoretical_loss": 3.4878590914629815, + "tokens_seen": 1644900352 + }, + { + "epoch": 4.08, + "learning_rate": 0.000253370110330993, + "loss": 2.7337, + "theoretical_loss": 3.487847055165546, + "tokens_seen": 1644965888 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025336008024072216, + "loss": 2.6194, + "theoretical_loss": 3.487835019481894, + "tokens_seen": 1645031424 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002533500501504514, + "loss": 2.7091, + "theoretical_loss": 3.4878229844119684, + "tokens_seen": 1645096960 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002533400200601805, + "loss": 2.8144, + "theoretical_loss": 3.487810949955715, + "tokens_seen": 1645162496 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025332998996990976, + "loss": 2.7681, + "theoretical_loss": 3.487798916113077, + "tokens_seen": 1645228032 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002533199598796389, + "loss": 2.5793, + "theoretical_loss": 3.4877868828839995, + "tokens_seen": 1645293568 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002533099297893681, + "loss": 2.8251, + "theoretical_loss": 3.487774850268426, + "tokens_seen": 1645359104 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002532998996990973, + "loss": 2.7861, + "theoretical_loss": 3.487762818266301, + "tokens_seen": 1645424640 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002532898696088265, + "loss": 2.9023, + "theoretical_loss": 3.4877507868775686, + "tokens_seen": 1645490176 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025327983951855566, + "loss": 2.6017, + "theoretical_loss": 3.487738756102174, + "tokens_seen": 1645555712 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025326980942828485, + "loss": 2.8273, + "theoretical_loss": 3.487726725940061, + "tokens_seen": 1645621248 + }, + { + "epoch": 4.08, + "learning_rate": 0.000253259779338014, + "loss": 2.6983, + "theoretical_loss": 3.487714696391173, + "tokens_seen": 1645686784 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025324974924774326, + "loss": 2.7257, + "theoretical_loss": 3.487702667455456, + "tokens_seen": 1645752320 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002532397191574724, + "loss": 2.7526, + "theoretical_loss": 3.487690639132853, + "tokens_seen": 1645817856 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002532296890672016, + "loss": 2.7848, + "theoretical_loss": 3.4876786114233087, + "tokens_seen": 1645883392 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025321965897693075, + "loss": 2.7977, + "theoretical_loss": 3.487666584326768, + "tokens_seen": 1645948928 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025320962888666, + "loss": 2.7308, + "theoretical_loss": 3.4876545578431744, + "tokens_seen": 1646014464 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025319959879638917, + "loss": 3.0773, + "theoretical_loss": 3.4876425319724724, + "tokens_seen": 1646080000 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025318956870611835, + "loss": 2.6792, + "theoretical_loss": 3.4876305067146074, + "tokens_seen": 1646145536 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025317953861584753, + "loss": 2.7992, + "theoretical_loss": 3.4876184820695224, + "tokens_seen": 1646211072 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025316950852557677, + "loss": 2.7411, + "theoretical_loss": 3.4876064580371624, + "tokens_seen": 1646276608 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002531594784353059, + "loss": 2.7559, + "theoretical_loss": 3.4875944346174723, + "tokens_seen": 1646342144 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025314944834503513, + "loss": 2.6848, + "theoretical_loss": 3.4875824118103953, + "tokens_seen": 1646407680 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025313941825476425, + "loss": 2.7572, + "theoretical_loss": 3.4875703896158767, + "tokens_seen": 1646473216 + }, + { + "epoch": 4.08, + "objective/train/docs_used": 2625761, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5998599529266357, + "objective/train/theoretical_loss": 3.4875583680338607, + "objective/train/tokens_used": 1666998752, + "theoretical_loss": 3.4875583680338607, + "tokens_seen": 1646538752 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002531293881644935, + "loss": 2.7904, + "theoretical_loss": 3.4875583680338607, + "tokens_seen": 1646538752 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025311935807422267, + "loss": 2.642, + "theoretical_loss": 3.487546347064292, + "tokens_seen": 1646604288 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025310932798395185, + "loss": 2.9144, + "theoretical_loss": 3.487534326707114, + "tokens_seen": 1646669824 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025309929789368103, + "loss": 2.6151, + "theoretical_loss": 3.4875223069622727, + "tokens_seen": 1646735360 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002530892678034102, + "loss": 2.692, + "theoretical_loss": 3.4875102878297106, + "tokens_seen": 1646800896 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002530792377131394, + "loss": 2.8006, + "theoretical_loss": 3.4874982693093743, + "tokens_seen": 1646866432 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025306920762286863, + "loss": 2.7964, + "theoretical_loss": 3.4874862514012066, + "tokens_seen": 1646931968 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025305917753259776, + "loss": 2.619, + "theoretical_loss": 3.487474234105153, + "tokens_seen": 1646997504 + }, + { + "epoch": 4.08, + "learning_rate": 0.000253049147442327, + "loss": 2.7747, + "theoretical_loss": 3.4874622174211574, + "tokens_seen": 1647063040 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002530391173520562, + "loss": 2.7306, + "theoretical_loss": 3.487450201349164, + "tokens_seen": 1647128576 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025302908726178536, + "loss": 2.7613, + "theoretical_loss": 3.4874381858891175, + "tokens_seen": 1647194112 + }, + { + "epoch": 4.08, + "learning_rate": 0.00025301905717151454, + "loss": 2.5992, + "theoretical_loss": 3.487426171040963, + "tokens_seen": 1647259648 + }, + { + "epoch": 4.08, + "learning_rate": 0.0002530090270812437, + "loss": 2.713, + "theoretical_loss": 3.4874141568046446, + "tokens_seen": 1647325184 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025299899699097295, + "loss": 2.8184, + "theoretical_loss": 3.4874021431801068, + "tokens_seen": 1647390720 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025298896690070213, + "loss": 2.7681, + "theoretical_loss": 3.487390130167294, + "tokens_seen": 1647456256 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002529789368104313, + "loss": 2.7371, + "theoretical_loss": 3.4873781177661507, + "tokens_seen": 1647521792 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002529689067201605, + "loss": 2.7674, + "theoretical_loss": 3.4873661059766214, + "tokens_seen": 1647587328 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002529588766298897, + "loss": 2.6969, + "theoretical_loss": 3.487354094798651, + "tokens_seen": 1647652864 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025294884653961886, + "loss": 2.797, + "theoretical_loss": 3.487342084232184, + "tokens_seen": 1647718400 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002529388164493481, + "loss": 2.7205, + "theoretical_loss": 3.487330074277164, + "tokens_seen": 1647783936 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002529287863590772, + "loss": 2.7013, + "theoretical_loss": 3.487318064933537, + "tokens_seen": 1647849472 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025291875626880646, + "loss": 2.5953, + "theoretical_loss": 3.4873060562012466, + "tokens_seen": 1647915008 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002529087261785356, + "loss": 2.6819, + "theoretical_loss": 3.4872940480802375, + "tokens_seen": 1647980544 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002528986960882648, + "loss": 2.7669, + "theoretical_loss": 3.4872820405704545, + "tokens_seen": 1648046080 + }, + { + "epoch": 4.09, + "learning_rate": 0.000252888665997994, + "loss": 2.7369, + "theoretical_loss": 3.4872700336718423, + "tokens_seen": 1648111616 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2627138, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6203854084014893, + "objective/train/theoretical_loss": 3.487258027384345, + "objective/train/tokens_used": 1668637152, + "theoretical_loss": 3.487258027384345, + "tokens_seen": 1648177152 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002528786359077232, + "loss": 2.7796, + "theoretical_loss": 3.487258027384345, + "tokens_seen": 1648177152 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025286860581745236, + "loss": 2.8607, + "theoretical_loss": 3.4872460217079073, + "tokens_seen": 1648242688 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002528585757271816, + "loss": 2.7476, + "theoretical_loss": 3.4872340166424745, + "tokens_seen": 1648308224 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002528485456369107, + "loss": 2.7478, + "theoretical_loss": 3.48722201218799, + "tokens_seen": 1648373760 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025283851554663996, + "loss": 2.8283, + "theoretical_loss": 3.4872100083444, + "tokens_seen": 1648439296 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002528284854563691, + "loss": 2.6584, + "theoretical_loss": 3.4871980051116482, + "tokens_seen": 1648504832 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002528184553660983, + "loss": 2.7001, + "theoretical_loss": 3.4871860024896786, + "tokens_seen": 1648570368 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002528084252758275, + "loss": 2.6605, + "theoretical_loss": 3.487174000478437, + "tokens_seen": 1648635904 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002527983951855567, + "loss": 2.7518, + "theoretical_loss": 3.487161999077867, + "tokens_seen": 1648701440 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025278836509528586, + "loss": 2.6791, + "theoretical_loss": 3.487149998287914, + "tokens_seen": 1648766976 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025277833500501505, + "loss": 2.8264, + "theoretical_loss": 3.487137998108523, + "tokens_seen": 1648832512 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002527683049147442, + "loss": 2.7184, + "theoretical_loss": 3.487125998539638, + "tokens_seen": 1648898048 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025275827482447346, + "loss": 2.8058, + "theoretical_loss": 3.4871139995812035, + "tokens_seen": 1648963584 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002527482447342026, + "loss": 2.7868, + "theoretical_loss": 3.487102001233165, + "tokens_seen": 1649029120 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002527382146439318, + "loss": 2.7161, + "theoretical_loss": 3.4870900034954664, + "tokens_seen": 1649094656 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025272818455366095, + "loss": 2.5844, + "theoretical_loss": 3.4870780063680527, + "tokens_seen": 1649160192 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002527181544633902, + "loss": 2.9054, + "theoretical_loss": 3.4870660098508686, + "tokens_seen": 1649225728 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025270812437311937, + "loss": 2.7755, + "theoretical_loss": 3.487054013943859, + "tokens_seen": 1649291264 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025269809428284855, + "loss": 2.7477, + "theoretical_loss": 3.487042018646968, + "tokens_seen": 1649356800 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025268806419257773, + "loss": 2.7131, + "theoretical_loss": 3.4870300239601413, + "tokens_seen": 1649422336 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025267803410230697, + "loss": 2.9488, + "theoretical_loss": 3.487018029883323, + "tokens_seen": 1649487872 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002526680040120361, + "loss": 2.7615, + "theoretical_loss": 3.487006036416458, + "tokens_seen": 1649553408 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025265797392176533, + "loss": 2.6374, + "theoretical_loss": 3.4869940435594904, + "tokens_seen": 1649618944 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025264794383149445, + "loss": 2.8659, + "theoretical_loss": 3.486982051312366, + "tokens_seen": 1649684480 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002526379137412237, + "loss": 2.817, + "theoretical_loss": 3.4869700596750297, + "tokens_seen": 1649750016 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2629919, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.656588554382324, + "objective/train/theoretical_loss": 3.486958068647425, + "objective/train/tokens_used": 1670275552, + "theoretical_loss": 3.486958068647425, + "tokens_seen": 1649815552 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025262788365095287, + "loss": 2.7372, + "theoretical_loss": 3.486958068647425, + "tokens_seen": 1649815552 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025261785356068205, + "loss": 2.8229, + "theoretical_loss": 3.4869460782294976, + "tokens_seen": 1649881088 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025260782347041123, + "loss": 2.6759, + "theoretical_loss": 3.486934088421192, + "tokens_seen": 1649946624 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002525977933801404, + "loss": 2.917, + "theoretical_loss": 3.486922099222453, + "tokens_seen": 1650012160 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002525877632898696, + "loss": 2.6458, + "theoretical_loss": 3.486910110633226, + "tokens_seen": 1650077696 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025257773319959883, + "loss": 2.7642, + "theoretical_loss": 3.4868981226534546, + "tokens_seen": 1650143232 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025256770310932796, + "loss": 2.8599, + "theoretical_loss": 3.4868861352830844, + "tokens_seen": 1650208768 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002525576730190572, + "loss": 2.8468, + "theoretical_loss": 3.4868741485220607, + "tokens_seen": 1650274304 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002525476429287864, + "loss": 2.7261, + "theoretical_loss": 3.486862162370327, + "tokens_seen": 1650339840 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025253761283851556, + "loss": 2.8838, + "theoretical_loss": 3.4868501768278293, + "tokens_seen": 1650405376 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025252758274824474, + "loss": 2.6575, + "theoretical_loss": 3.4868381918945115, + "tokens_seen": 1650470912 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002525175526579739, + "loss": 2.6323, + "theoretical_loss": 3.48682620757032, + "tokens_seen": 1650536448 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002525075225677031, + "loss": 2.7332, + "theoretical_loss": 3.4868142238551973, + "tokens_seen": 1650601984 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025249749247743233, + "loss": 2.8637, + "theoretical_loss": 3.48680224074909, + "tokens_seen": 1650667520 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025248746238716146, + "loss": 2.6676, + "theoretical_loss": 3.486790258251943, + "tokens_seen": 1650733056 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002524774322968907, + "loss": 2.9204, + "theoretical_loss": 3.4867782763637005, + "tokens_seen": 1650798592 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002524674022066198, + "loss": 2.8117, + "theoretical_loss": 3.4867662950843075, + "tokens_seen": 1650864128 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025245737211634906, + "loss": 2.8454, + "theoretical_loss": 3.486754314413709, + "tokens_seen": 1650929664 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025244734202607824, + "loss": 2.855, + "theoretical_loss": 3.48674233435185, + "tokens_seen": 1650995200 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002524373119358074, + "loss": 2.8345, + "theoretical_loss": 3.4867303548986754, + "tokens_seen": 1651060736 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002524272818455366, + "loss": 2.7387, + "theoretical_loss": 3.4867183760541303, + "tokens_seen": 1651126272 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002524172517552658, + "loss": 2.7824, + "theoretical_loss": 3.4867063978181587, + "tokens_seen": 1651191808 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025240722166499496, + "loss": 2.6865, + "theoretical_loss": 3.486694420190707, + "tokens_seen": 1651257344 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002523971915747242, + "loss": 2.6665, + "theoretical_loss": 3.486682443171719, + "tokens_seen": 1651322880 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002523871614844533, + "loss": 2.6568, + "theoretical_loss": 3.4866704667611397, + "tokens_seen": 1651388416 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2632634, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9229941368103027, + "objective/train/theoretical_loss": 3.4866584909589147, + "objective/train/tokens_used": 1671913952, + "theoretical_loss": 3.4866584909589147, + "tokens_seen": 1651453952 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025237713139418256, + "loss": 2.8844, + "theoretical_loss": 3.4866584909589147, + "tokens_seen": 1651453952 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025236710130391174, + "loss": 2.678, + "theoretical_loss": 3.4866465157649884, + "tokens_seen": 1651519488 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002523570712136409, + "loss": 2.7309, + "theoretical_loss": 3.4866345411793054, + "tokens_seen": 1651585024 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002523470411233701, + "loss": 2.9278, + "theoretical_loss": 3.486622567201812, + "tokens_seen": 1651650560 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002523370110330993, + "loss": 2.7682, + "theoretical_loss": 3.4866105938324523, + "tokens_seen": 1651716096 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025232698094282847, + "loss": 2.8521, + "theoretical_loss": 3.4865986210711712, + "tokens_seen": 1651781632 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002523169508525577, + "loss": 2.7875, + "theoretical_loss": 3.486586648917914, + "tokens_seen": 1651847168 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025230692076228683, + "loss": 2.6125, + "theoretical_loss": 3.486574677372625, + "tokens_seen": 1651912704 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025229689067201606, + "loss": 2.8222, + "theoretical_loss": 3.4865627064352505, + "tokens_seen": 1651978240 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002522868605817452, + "loss": 2.7805, + "theoretical_loss": 3.4865507361057344, + "tokens_seen": 1652043776 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025227683049147443, + "loss": 2.736, + "theoretical_loss": 3.4865387663840224, + "tokens_seen": 1652109312 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002522668004012036, + "loss": 2.7694, + "theoretical_loss": 3.4865267972700593, + "tokens_seen": 1652174848 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002522567703109328, + "loss": 2.7771, + "theoretical_loss": 3.4865148287637897, + "tokens_seen": 1652240384 + }, + { + "epoch": 4.09, + "learning_rate": 0.000252246740220662, + "loss": 2.8521, + "theoretical_loss": 3.4865028608651594, + "tokens_seen": 1652305920 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025223671013039115, + "loss": 2.7245, + "theoretical_loss": 3.486490893574113, + "tokens_seen": 1652371456 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002522266800401204, + "loss": 2.6635, + "theoretical_loss": 3.4864789268905954, + "tokens_seen": 1652436992 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025221664994984957, + "loss": 2.8847, + "theoretical_loss": 3.4864669608145524, + "tokens_seen": 1652502528 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025220661985957875, + "loss": 2.6911, + "theoretical_loss": 3.486454995345928, + "tokens_seen": 1652568064 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025219658976930793, + "loss": 2.6048, + "theoretical_loss": 3.4864430304846685, + "tokens_seen": 1652633600 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025218655967903717, + "loss": 2.8093, + "theoretical_loss": 3.4864310662307174, + "tokens_seen": 1652699136 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002521765295887663, + "loss": 2.8384, + "theoretical_loss": 3.486419102584022, + "tokens_seen": 1652764672 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025216649949849553, + "loss": 2.7539, + "theoretical_loss": 3.486407139544525, + "tokens_seen": 1652830208 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025215646940822465, + "loss": 2.8075, + "theoretical_loss": 3.4863951771121733, + "tokens_seen": 1652895744 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002521464393179539, + "loss": 2.7797, + "theoretical_loss": 3.486383215286911, + "tokens_seen": 1652961280 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025213640922768307, + "loss": 2.8507, + "theoretical_loss": 3.486371254068684, + "tokens_seen": 1653026816 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2635279, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.613892078399658, + "objective/train/theoretical_loss": 3.486359293457437, + "objective/train/tokens_used": 1673552352, + "theoretical_loss": 3.486359293457437, + "tokens_seen": 1653092352 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025212637913741225, + "loss": 2.9237, + "theoretical_loss": 3.486359293457437, + "tokens_seen": 1653092352 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025211634904714143, + "loss": 2.9341, + "theoretical_loss": 3.4863473334531148, + "tokens_seen": 1653157888 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002521063189568706, + "loss": 2.8886, + "theoretical_loss": 3.4863353740556633, + "tokens_seen": 1653223424 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002520962888665998, + "loss": 2.7954, + "theoretical_loss": 3.486323415265027, + "tokens_seen": 1653288960 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025208625877632903, + "loss": 2.8273, + "theoretical_loss": 3.4863114570811513, + "tokens_seen": 1653354496 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025207622868605816, + "loss": 2.7384, + "theoretical_loss": 3.486299499503981, + "tokens_seen": 1653420032 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002520661985957874, + "loss": 2.7659, + "theoretical_loss": 3.4862875425334625, + "tokens_seen": 1653485568 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002520561685055166, + "loss": 2.7941, + "theoretical_loss": 3.4862755861695396, + "tokens_seen": 1653551104 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025204613841524576, + "loss": 2.615, + "theoretical_loss": 3.486263630412158, + "tokens_seen": 1653616640 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025203610832497494, + "loss": 2.7252, + "theoretical_loss": 3.486251675261263, + "tokens_seen": 1653682176 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002520260782347041, + "loss": 2.7546, + "theoretical_loss": 3.4862397207168, + "tokens_seen": 1653747712 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002520160481444333, + "loss": 2.7659, + "theoretical_loss": 3.486227766778714, + "tokens_seen": 1653813248 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025200601805416253, + "loss": 2.837, + "theoretical_loss": 3.4862158134469494, + "tokens_seen": 1653878784 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025199598796389166, + "loss": 2.6899, + "theoretical_loss": 3.4862038607214525, + "tokens_seen": 1653944320 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002519859578736209, + "loss": 2.6738, + "theoretical_loss": 3.4861919086021684, + "tokens_seen": 1654009856 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025197592778335, + "loss": 2.8224, + "theoretical_loss": 3.4861799570890417, + "tokens_seen": 1654075392 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025196589769307926, + "loss": 2.6775, + "theoretical_loss": 3.4861680061820186, + "tokens_seen": 1654140928 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025195586760280844, + "loss": 2.6947, + "theoretical_loss": 3.486156055881043, + "tokens_seen": 1654206464 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002519458375125376, + "loss": 2.7898, + "theoretical_loss": 3.4861441061860616, + "tokens_seen": 1654272000 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002519358074222668, + "loss": 2.6639, + "theoretical_loss": 3.486132157097019, + "tokens_seen": 1654337536 + }, + { + "epoch": 4.09, + "learning_rate": 0.000251925777331996, + "loss": 2.751, + "theoretical_loss": 3.48612020861386, + "tokens_seen": 1654403072 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025191574724172516, + "loss": 2.7743, + "theoretical_loss": 3.4861082607365312, + "tokens_seen": 1654468608 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002519057171514544, + "loss": 2.7232, + "theoretical_loss": 3.4860963134649765, + "tokens_seen": 1654534144 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002518956870611835, + "loss": 2.8475, + "theoretical_loss": 3.486084366799142, + "tokens_seen": 1654599680 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025188565697091276, + "loss": 2.7966, + "theoretical_loss": 3.486072420738973, + "tokens_seen": 1654665216 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2638273, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6086912155151367, + "objective/train/theoretical_loss": 3.4860604752844147, + "objective/train/tokens_used": 1675190752, + "theoretical_loss": 3.4860604752844147, + "tokens_seen": 1654730752 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025187562688064194, + "loss": 2.6462, + "theoretical_loss": 3.4860604752844147, + "tokens_seen": 1654730752 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002518655967903711, + "loss": 2.6908, + "theoretical_loss": 3.4860485304354114, + "tokens_seen": 1654796288 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002518555667001003, + "loss": 2.8044, + "theoretical_loss": 3.48603658619191, + "tokens_seen": 1654861824 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002518455366098295, + "loss": 2.6837, + "theoretical_loss": 3.486024642553855, + "tokens_seen": 1654927360 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025183550651955867, + "loss": 2.7373, + "theoretical_loss": 3.4860126995211917, + "tokens_seen": 1654992896 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002518254764292879, + "loss": 2.7139, + "theoretical_loss": 3.486000757093866, + "tokens_seen": 1655058432 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025181544633901703, + "loss": 2.8763, + "theoretical_loss": 3.4859888152718224, + "tokens_seen": 1655123968 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025180541624874627, + "loss": 2.7593, + "theoretical_loss": 3.485976874055007, + "tokens_seen": 1655189504 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002517953861584754, + "loss": 2.7983, + "theoretical_loss": 3.4859649334433653, + "tokens_seen": 1655255040 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025178535606820463, + "loss": 2.7179, + "theoretical_loss": 3.4859529934368414, + "tokens_seen": 1655320576 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002517753259779338, + "loss": 2.8001, + "theoretical_loss": 3.485941054035382, + "tokens_seen": 1655386112 + }, + { + "epoch": 4.09, + "learning_rate": 0.000251765295887663, + "loss": 2.6482, + "theoretical_loss": 3.4859291152389322, + "tokens_seen": 1655451648 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025175526579739217, + "loss": 2.7656, + "theoretical_loss": 3.485917177047437, + "tokens_seen": 1655517184 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025174523570712135, + "loss": 2.8278, + "theoretical_loss": 3.485905239460842, + "tokens_seen": 1655582720 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025173520561685053, + "loss": 2.9139, + "theoretical_loss": 3.4858933024790932, + "tokens_seen": 1655648256 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025172517552657977, + "loss": 2.7737, + "theoretical_loss": 3.4858813661021344, + "tokens_seen": 1655713792 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002517151454363089, + "loss": 2.7246, + "theoretical_loss": 3.485869430329913, + "tokens_seen": 1655779328 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025170511534603813, + "loss": 2.7368, + "theoretical_loss": 3.485857495162373, + "tokens_seen": 1655844864 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002516950852557673, + "loss": 2.7093, + "theoretical_loss": 3.4858455605994605, + "tokens_seen": 1655910400 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002516850551654965, + "loss": 2.6423, + "theoretical_loss": 3.4858336266411207, + "tokens_seen": 1655975936 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002516750250752257, + "loss": 2.7178, + "theoretical_loss": 3.485821693287299, + "tokens_seen": 1656041472 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025166499498495486, + "loss": 2.5817, + "theoretical_loss": 3.4858097605379412, + "tokens_seen": 1656107008 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025165496489468404, + "loss": 2.8056, + "theoretical_loss": 3.4857978283929922, + "tokens_seen": 1656172544 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025164493480441327, + "loss": 2.8081, + "theoretical_loss": 3.4857858968523985, + "tokens_seen": 1656238080 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002516349047141424, + "loss": 2.6555, + "theoretical_loss": 3.4857739659161044, + "tokens_seen": 1656303616 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2641271, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5274267196655273, + "objective/train/theoretical_loss": 3.485762035584056, + "objective/train/tokens_used": 1676829152, + "theoretical_loss": 3.485762035584056, + "tokens_seen": 1656369152 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025162487462387163, + "loss": 2.8272, + "theoretical_loss": 3.485762035584056, + "tokens_seen": 1656369152 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025161484453360076, + "loss": 2.6011, + "theoretical_loss": 3.4857501058561984, + "tokens_seen": 1656434688 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025160481444333, + "loss": 2.8396, + "theoretical_loss": 3.4857381767324775, + "tokens_seen": 1656500224 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002515947843530592, + "loss": 2.7867, + "theoretical_loss": 3.485726248212839, + "tokens_seen": 1656565760 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025158475426278836, + "loss": 2.8493, + "theoretical_loss": 3.4857143202972276, + "tokens_seen": 1656631296 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025157472417251754, + "loss": 2.7794, + "theoretical_loss": 3.4857023929855897, + "tokens_seen": 1656696832 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002515646940822468, + "loss": 2.7728, + "theoretical_loss": 3.4856904662778705, + "tokens_seen": 1656762368 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002515546639919759, + "loss": 2.6777, + "theoretical_loss": 3.485678540174015, + "tokens_seen": 1656827904 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025154463390170514, + "loss": 2.9714, + "theoretical_loss": 3.4856666146739697, + "tokens_seen": 1656893440 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025153460381143426, + "loss": 2.7602, + "theoretical_loss": 3.4856546897776792, + "tokens_seen": 1656958976 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002515245737211635, + "loss": 2.5663, + "theoretical_loss": 3.48564276548509, + "tokens_seen": 1657024512 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002515145436308927, + "loss": 2.7833, + "theoretical_loss": 3.4856308417961466, + "tokens_seen": 1657090048 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025150451354062186, + "loss": 2.8451, + "theoretical_loss": 3.485618918710795, + "tokens_seen": 1657155584 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002514944834503511, + "loss": 2.5886, + "theoretical_loss": 3.4856069962289817, + "tokens_seen": 1657221120 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002514844533600802, + "loss": 2.7919, + "theoretical_loss": 3.4855950743506514, + "tokens_seen": 1657286656 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025147442326980946, + "loss": 2.8536, + "theoretical_loss": 3.4855831530757495, + "tokens_seen": 1657352192 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025146439317953864, + "loss": 2.7775, + "theoretical_loss": 3.4855712324042223, + "tokens_seen": 1657417728 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002514543630892678, + "loss": 2.8506, + "theoretical_loss": 3.4855593123360142, + "tokens_seen": 1657483264 + }, + { + "epoch": 4.09, + "learning_rate": 0.000251444332998997, + "loss": 2.6839, + "theoretical_loss": 3.485547392871072, + "tokens_seen": 1657548800 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002514343029087262, + "loss": 2.8293, + "theoretical_loss": 3.4855354740093416, + "tokens_seen": 1657614336 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025142427281845536, + "loss": 2.6217, + "theoretical_loss": 3.485523555750767, + "tokens_seen": 1657679872 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002514142427281846, + "loss": 2.7716, + "theoretical_loss": 3.4855116380952955, + "tokens_seen": 1657745408 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002514042126379137, + "loss": 2.8282, + "theoretical_loss": 3.4854997210428715, + "tokens_seen": 1657810944 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025139418254764296, + "loss": 2.8307, + "theoretical_loss": 3.4854878045934417, + "tokens_seen": 1657876480 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025138415245737214, + "loss": 2.8555, + "theoretical_loss": 3.485475888746951, + "tokens_seen": 1657942016 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2642760, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.926617383956909, + "objective/train/theoretical_loss": 3.485463973503345, + "objective/train/tokens_used": 1678467552, + "theoretical_loss": 3.485463973503345, + "tokens_seen": 1658007552 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002513741223671013, + "loss": 2.9161, + "theoretical_loss": 3.485463973503345, + "tokens_seen": 1658007552 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002513640922768305, + "loss": 2.6603, + "theoretical_loss": 3.48545205886257, + "tokens_seen": 1658073088 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002513540621865597, + "loss": 2.72, + "theoretical_loss": 3.485440144824571, + "tokens_seen": 1658138624 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025134403209628887, + "loss": 2.8483, + "theoretical_loss": 3.485428231389294, + "tokens_seen": 1658204160 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002513340020060181, + "loss": 2.8451, + "theoretical_loss": 3.4854163185566853, + "tokens_seen": 1658269696 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025132397191574723, + "loss": 2.7449, + "theoretical_loss": 3.4854044063266896, + "tokens_seen": 1658335232 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025131394182547647, + "loss": 2.5834, + "theoretical_loss": 3.485392494699253, + "tokens_seen": 1658400768 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002513039117352056, + "loss": 2.9013, + "theoretical_loss": 3.4853805836743215, + "tokens_seen": 1658466304 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025129388164493483, + "loss": 2.8144, + "theoretical_loss": 3.4853686732518403, + "tokens_seen": 1658531840 + }, + { + "epoch": 4.09, + "learning_rate": 0.000251283851554664, + "loss": 2.7341, + "theoretical_loss": 3.4853567634317555, + "tokens_seen": 1658597376 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002512738214643932, + "loss": 2.8074, + "theoretical_loss": 3.4853448542140124, + "tokens_seen": 1658662912 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025126379137412237, + "loss": 2.7925, + "theoretical_loss": 3.4853329455985573, + "tokens_seen": 1658728448 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025125376128385155, + "loss": 2.755, + "theoretical_loss": 3.4853210375853356, + "tokens_seen": 1658793984 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025124373119358073, + "loss": 2.7272, + "theoretical_loss": 3.4853091301742936, + "tokens_seen": 1658859520 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025123370110330997, + "loss": 2.7254, + "theoretical_loss": 3.485297223365376, + "tokens_seen": 1658925056 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002512236710130391, + "loss": 2.8036, + "theoretical_loss": 3.485285317158529, + "tokens_seen": 1658990592 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025121364092276833, + "loss": 2.8889, + "theoretical_loss": 3.485273411553699, + "tokens_seen": 1659056128 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002512036108324975, + "loss": 2.7781, + "theoretical_loss": 3.485261506550831, + "tokens_seen": 1659121664 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002511935807422267, + "loss": 2.6745, + "theoretical_loss": 3.4852496021498713, + "tokens_seen": 1659187200 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002511835506519559, + "loss": 2.7672, + "theoretical_loss": 3.4852376983507654, + "tokens_seen": 1659252736 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025117352056168506, + "loss": 2.7816, + "theoretical_loss": 3.4852257951534593, + "tokens_seen": 1659318272 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025116349047141424, + "loss": 2.8338, + "theoretical_loss": 3.4852138925578986, + "tokens_seen": 1659383808 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025115346038114347, + "loss": 2.7992, + "theoretical_loss": 3.48520199056403, + "tokens_seen": 1659449344 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002511434302908726, + "loss": 2.8746, + "theoretical_loss": 3.4851900891717973, + "tokens_seen": 1659514880 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025113340020060183, + "loss": 2.5625, + "theoretical_loss": 3.4851781883811483, + "tokens_seen": 1659580416 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2645641, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6898205280303955, + "objective/train/theoretical_loss": 3.485166288192028, + "objective/train/tokens_used": 1680105952, + "theoretical_loss": 3.485166288192028, + "tokens_seen": 1659645952 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025112337011033096, + "loss": 2.8185, + "theoretical_loss": 3.485166288192028, + "tokens_seen": 1659645952 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002511133400200602, + "loss": 2.7123, + "theoretical_loss": 3.485154388604382, + "tokens_seen": 1659711488 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002511033099297894, + "loss": 2.7908, + "theoretical_loss": 3.4851424896181573, + "tokens_seen": 1659777024 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025109327983951856, + "loss": 2.6338, + "theoretical_loss": 3.4851305912332986, + "tokens_seen": 1659842560 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025108324974924774, + "loss": 2.8337, + "theoretical_loss": 3.485118693449752, + "tokens_seen": 1659908096 + }, + { + "epoch": 4.09, + "learning_rate": 0.000251073219658977, + "loss": 2.5754, + "theoretical_loss": 3.4851067962674636, + "tokens_seen": 1659973632 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002510631895687061, + "loss": 2.8667, + "theoretical_loss": 3.485094899686379, + "tokens_seen": 1660039168 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025105315947843534, + "loss": 2.5967, + "theoretical_loss": 3.4850830037064444, + "tokens_seen": 1660104704 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025104312938816446, + "loss": 2.8283, + "theoretical_loss": 3.4850711083276056, + "tokens_seen": 1660170240 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002510330992978937, + "loss": 2.6928, + "theoretical_loss": 3.4850592135498086, + "tokens_seen": 1660235776 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002510230692076229, + "loss": 2.9249, + "theoretical_loss": 3.485047319372999, + "tokens_seen": 1660301312 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025101303911735206, + "loss": 2.8245, + "theoretical_loss": 3.4850354257971228, + "tokens_seen": 1660366848 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025100300902708124, + "loss": 2.8422, + "theoretical_loss": 3.485023532822126, + "tokens_seen": 1660432384 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002509929789368104, + "loss": 2.8673, + "theoretical_loss": 3.485011640447955, + "tokens_seen": 1660497920 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002509829488465396, + "loss": 2.5777, + "theoretical_loss": 3.484999748674555, + "tokens_seen": 1660563456 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025097291875626884, + "loss": 2.7197, + "theoretical_loss": 3.484987857501872, + "tokens_seen": 1660628992 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025096288866599797, + "loss": 2.69, + "theoretical_loss": 3.484975966929852, + "tokens_seen": 1660694528 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002509528585757272, + "loss": 2.9071, + "theoretical_loss": 3.484964076958442, + "tokens_seen": 1660760064 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025094282848545633, + "loss": 2.6961, + "theoretical_loss": 3.484952187587586, + "tokens_seen": 1660825600 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025093279839518556, + "loss": 2.7542, + "theoretical_loss": 3.484940298817232, + "tokens_seen": 1660891136 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025092276830491475, + "loss": 2.8962, + "theoretical_loss": 3.4849284106473246, + "tokens_seen": 1660956672 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002509127382146439, + "loss": 2.721, + "theoretical_loss": 3.4849165230778105, + "tokens_seen": 1661022208 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002509027081243731, + "loss": 2.727, + "theoretical_loss": 3.4849046361086353, + "tokens_seen": 1661087744 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025089267803410234, + "loss": 2.9719, + "theoretical_loss": 3.4848927497397444, + "tokens_seen": 1661153280 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025088264794383147, + "loss": 2.7782, + "theoretical_loss": 3.4848808639710853, + "tokens_seen": 1661218816 + }, + { + "epoch": 4.09, + "objective/train/docs_used": 2648038, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8230135440826416, + "objective/train/theoretical_loss": 3.484868978802603, + "objective/train/tokens_used": 1681744352, + "theoretical_loss": 3.484868978802603, + "tokens_seen": 1661284352 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002508726178535607, + "loss": 2.7993, + "theoretical_loss": 3.484868978802603, + "tokens_seen": 1661284352 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025086258776328983, + "loss": 2.8324, + "theoretical_loss": 3.4848570942342443, + "tokens_seen": 1661349888 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025085255767301907, + "loss": 2.7131, + "theoretical_loss": 3.484845210265954, + "tokens_seen": 1661415424 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025084252758274825, + "loss": 2.7464, + "theoretical_loss": 3.4848333268976788, + "tokens_seen": 1661480960 + }, + { + "epoch": 4.09, + "learning_rate": 0.00025083249749247743, + "loss": 2.7884, + "theoretical_loss": 3.4848214441293646, + "tokens_seen": 1661546496 + }, + { + "epoch": 4.09, + "learning_rate": 0.0002508224674022066, + "loss": 2.7965, + "theoretical_loss": 3.484809561960958, + "tokens_seen": 1661612032 + } + ], + "max_steps": 50354, + "num_train_epochs": 9223372036854775807, + "total_flos": 8.47980437962752e+17, + "trial_name": null, + "trial_params": null +}